{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "global_step": 9036, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.6764705882352945e-08, "logits/chosen": -0.2862769663333893, "logits/rejected": -0.13911651074886322, "logps/chosen": -74.69404602050781, "logps/rejected": -115.48554229736328, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 7.352941176470589e-08, "logits/chosen": -0.377959281206131, "logits/rejected": -0.34203004837036133, "logps/chosen": -126.82559967041016, "logps/rejected": -60.5023307800293, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.1029411764705884e-07, "logits/chosen": -0.12902657687664032, "logits/rejected": -0.1033184826374054, "logps/chosen": -132.51626586914062, "logps/rejected": -150.982666015625, "loss": 0.7412, "rewards/accuracies": 0.0, "rewards/chosen": -0.03916015848517418, "rewards/margins": -0.1555938720703125, "rewards/rejected": 0.11643371731042862, "step": 3 }, { "epoch": 0.0, "learning_rate": 1.4705882352941178e-07, "logits/chosen": -0.4978506863117218, "logits/rejected": -0.5146324038505554, "logps/chosen": -158.593017578125, "logps/rejected": -203.69113159179688, "loss": 0.7213, "rewards/accuracies": 0.0, "rewards/chosen": 0.03021087683737278, "rewards/margins": -0.05355682969093323, "rewards/rejected": 0.08376770466566086, "step": 4 }, { "epoch": 0.0, "learning_rate": 1.8382352941176472e-07, "logits/chosen": -0.5148776769638062, "logits/rejected": -0.46344611048698425, "logps/chosen": -97.68838500976562, "logps/rejected": -67.17710876464844, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.00860595703125, "rewards/margins": 0.0035858154296875, "rewards/rejected": 0.0050201416015625, "step": 5 }, { "epoch": 0.0, "learning_rate": 2.2058823529411768e-07, "logits/chosen": -0.6777510643005371, "logits/rejected": -0.6384617686271667, "logps/chosen": -179.42730712890625, "logps/rejected": -107.46916198730469, "loss": 0.6639, "rewards/accuracies": 0.0, "rewards/chosen": -0.0304718017578125, "rewards/margins": -0.02165374718606472, "rewards/rejected": -0.00881805457174778, "step": 6 }, { "epoch": 0.0, "learning_rate": 2.573529411764706e-07, "logits/chosen": -0.20640161633491516, "logits/rejected": -0.19215260446071625, "logps/chosen": -35.80889892578125, "logps/rejected": -82.40386962890625, "loss": 0.707, "rewards/accuracies": 0.0, "rewards/chosen": -0.02021637000143528, "rewards/margins": -0.055727384984493256, "rewards/rejected": 0.035511016845703125, "step": 7 }, { "epoch": 0.0, "learning_rate": 2.9411764705882356e-07, "logits/chosen": 0.06648331880569458, "logits/rejected": 0.11190025508403778, "logps/chosen": -86.77867126464844, "logps/rejected": -98.81971740722656, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": -0.021921539679169655, "rewards/margins": -0.02449646033346653, "rewards/rejected": 0.002574920654296875, "step": 8 }, { "epoch": 0.0, "learning_rate": 3.308823529411765e-07, "logits/chosen": -0.6960507035255432, "logits/rejected": -0.6832364797592163, "logps/chosen": -173.53387451171875, "logps/rejected": -138.87692260742188, "loss": 0.6724, "rewards/accuracies": 1.0, "rewards/chosen": -0.012957763858139515, "rewards/margins": 0.00218963623046875, "rewards/rejected": -0.015147400088608265, "step": 9 }, { "epoch": 0.0, "learning_rate": 3.6764705882352943e-07, "logits/chosen": -0.48838070034980774, "logits/rejected": -0.5160732269287109, "logps/chosen": -85.56555938720703, "logps/rejected": -74.6715087890625, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.030091095715761185, "rewards/margins": 0.005922699347138405, "rewards/rejected": 0.02416839636862278, "step": 10 }, { "epoch": 0.0, "learning_rate": 4.044117647058824e-07, "logits/chosen": -0.39392635226249695, "logits/rejected": -0.41000896692276, "logps/chosen": -59.737003326416016, "logps/rejected": -56.84599304199219, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 0.04163971170783043, "rewards/margins": 0.05499344319105148, "rewards/rejected": -0.013353729620575905, "step": 11 }, { "epoch": 0.0, "learning_rate": 4.4117647058823536e-07, "logits/chosen": -0.4090505838394165, "logits/rejected": -0.37530606985092163, "logps/chosen": -134.25753784179688, "logps/rejected": -89.43792724609375, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 0.05974731594324112, "rewards/margins": 0.021756745874881744, "rewards/rejected": 0.037990570068359375, "step": 12 }, { "epoch": 0.0, "learning_rate": 4.779411764705882e-07, "logits/chosen": -0.25680819153785706, "logits/rejected": -0.25680819153785706, "logps/chosen": -128.91751098632812, "logps/rejected": -128.91751098632812, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": 0.02039794996380806, "rewards/margins": 0.0, "rewards/rejected": 0.02039794996380806, "step": 13 }, { "epoch": 0.0, "learning_rate": 5.147058823529412e-07, "logits/chosen": -0.3725745379924774, "logits/rejected": -0.37210261821746826, "logps/chosen": -85.25466918945312, "logps/rejected": -83.14166259765625, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.051519013941287994, "rewards/margins": 0.07214432209730148, "rewards/rejected": -0.02062530629336834, "step": 14 }, { "epoch": 0.0, "learning_rate": 5.514705882352942e-07, "logits/chosen": -0.4763123691082001, "logits/rejected": -0.46601516008377075, "logps/chosen": -64.50300598144531, "logps/rejected": -58.23624801635742, "loss": 0.7159, "rewards/accuracies": 0.0, "rewards/chosen": -0.011529541574418545, "rewards/margins": -0.020402908325195312, "rewards/rejected": 0.008873367682099342, "step": 15 }, { "epoch": 0.0, "learning_rate": 5.882352941176471e-07, "logits/chosen": -0.19638612866401672, "logits/rejected": -0.13606543838977814, "logps/chosen": -76.5377197265625, "logps/rejected": -72.75706481933594, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.02674102783203125, "rewards/margins": 0.01382370013743639, "rewards/rejected": 0.01291732769459486, "step": 16 }, { "epoch": 0.0, "learning_rate": 6.25e-07, "logits/chosen": -0.33750197291374207, "logits/rejected": -0.33750197291374207, "logps/chosen": -97.32269287109375, "logps/rejected": -97.32269287109375, "loss": 0.6909, "rewards/accuracies": 0.0, "rewards/chosen": 0.009454346261918545, "rewards/margins": 0.0, "rewards/rejected": 0.009454346261918545, "step": 17 }, { "epoch": 0.0, "learning_rate": 6.61764705882353e-07, "logits/chosen": -0.35621631145477295, "logits/rejected": -0.35621631145477295, "logps/chosen": -79.79634094238281, "logps/rejected": -79.79634094238281, "loss": 0.6929, "rewards/accuracies": 0.0, "rewards/chosen": -0.03781280666589737, "rewards/margins": 0.0, "rewards/rejected": -0.03781280666589737, "step": 18 }, { "epoch": 0.0, "learning_rate": 6.985294117647059e-07, "logits/chosen": -0.24497543275356293, "logits/rejected": -0.21977068483829498, "logps/chosen": -83.06480407714844, "logps/rejected": -61.59651184082031, "loss": 0.6654, "rewards/accuracies": 1.0, "rewards/chosen": 0.0687103271484375, "rewards/margins": 0.06127510219812393, "rewards/rejected": 0.00743522634729743, "step": 19 }, { "epoch": 0.0, "learning_rate": 7.352941176470589e-07, "logits/chosen": -0.6613814234733582, "logits/rejected": -0.6373815536499023, "logps/chosen": -101.62416076660156, "logps/rejected": -109.42691802978516, "loss": 0.6762, "rewards/accuracies": 0.0, "rewards/chosen": 0.03210296854376793, "rewards/margins": -0.04192962870001793, "rewards/rejected": 0.07403259724378586, "step": 20 }, { "epoch": 0.0, "learning_rate": 7.720588235294119e-07, "logits/chosen": -0.5211349129676819, "logits/rejected": -0.5090523362159729, "logps/chosen": -147.78221130371094, "logps/rejected": -89.99435424804688, "loss": 0.6084, "rewards/accuracies": 1.0, "rewards/chosen": 0.09206543117761612, "rewards/margins": 0.055335234850645065, "rewards/rejected": 0.036730196326971054, "step": 21 }, { "epoch": 0.0, "learning_rate": 8.088235294117648e-07, "logits/chosen": -0.4309721291065216, "logits/rejected": -0.41999080777168274, "logps/chosen": -33.046897888183594, "logps/rejected": -29.594646453857422, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.034893035888671875, "rewards/margins": 0.060076139867305756, "rewards/rejected": -0.02518310584127903, "step": 22 }, { "epoch": 0.01, "learning_rate": 8.455882352941178e-07, "logits/chosen": -1.5346958637237549, "logits/rejected": -1.5252267122268677, "logps/chosen": -321.493896484375, "logps/rejected": -188.554443359375, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.0279541015625, "rewards/margins": -0.05325927585363388, "rewards/rejected": 0.08121337741613388, "step": 23 }, { "epoch": 0.01, "learning_rate": 8.823529411764707e-07, "logits/chosen": -0.167852982878685, "logits/rejected": -0.1597452461719513, "logps/chosen": -71.6339340209961, "logps/rejected": -54.579795837402344, "loss": 0.6619, "rewards/accuracies": 1.0, "rewards/chosen": 0.08475494384765625, "rewards/margins": 0.07678985595703125, "rewards/rejected": 0.007965087890625, "step": 24 }, { "epoch": 0.01, "learning_rate": 9.191176470588237e-07, "logits/chosen": -0.8656083345413208, "logits/rejected": -0.9113852977752686, "logps/chosen": -79.13139343261719, "logps/rejected": -16.214139938354492, "loss": 0.6592, "rewards/accuracies": 1.0, "rewards/chosen": 0.015854645520448685, "rewards/margins": 0.01802959479391575, "rewards/rejected": -0.002174949739128351, "step": 25 }, { "epoch": 0.01, "learning_rate": 9.558823529411764e-07, "logits/chosen": -0.4461967349052429, "logits/rejected": -0.3997352719306946, "logps/chosen": -37.4488410949707, "logps/rejected": -83.61023712158203, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.07547912746667862, "rewards/margins": 0.020496368408203125, "rewards/rejected": 0.054982759058475494, "step": 26 }, { "epoch": 0.01, "learning_rate": 9.926470588235295e-07, "logits/chosen": -0.09553100913763046, "logits/rejected": -0.06243213638663292, "logps/chosen": -105.97319793701172, "logps/rejected": -141.45016479492188, "loss": 0.6905, "rewards/accuracies": 0.0, "rewards/chosen": 0.11934051662683487, "rewards/margins": -0.04889220744371414, "rewards/rejected": 0.168232724070549, "step": 27 }, { "epoch": 0.01, "learning_rate": 1.0294117647058825e-06, "logits/chosen": -0.11800441145896912, "logits/rejected": -0.10121109336614609, "logps/chosen": -64.95045471191406, "logps/rejected": -101.42567443847656, "loss": 0.6354, "rewards/accuracies": 0.0, "rewards/chosen": 0.09806670993566513, "rewards/margins": -0.013580322265625, "rewards/rejected": 0.11164703220129013, "step": 28 }, { "epoch": 0.01, "learning_rate": 1.0661764705882354e-06, "logits/chosen": -0.5726810693740845, "logits/rejected": -0.5546489357948303, "logps/chosen": -140.19992065429688, "logps/rejected": -222.45706176757812, "loss": 0.7417, "rewards/accuracies": 0.0, "rewards/chosen": 0.108367919921875, "rewards/margins": -0.18574830889701843, "rewards/rejected": 0.29411622881889343, "step": 29 }, { "epoch": 0.01, "learning_rate": 1.1029411764705884e-06, "logits/chosen": -0.2618142366409302, "logits/rejected": -0.2797282040119171, "logps/chosen": -144.19589233398438, "logps/rejected": -96.92137145996094, "loss": 0.6073, "rewards/accuracies": 1.0, "rewards/chosen": 0.20260925590991974, "rewards/margins": 0.10119934380054474, "rewards/rejected": 0.101409912109375, "step": 30 }, { "epoch": 0.01, "learning_rate": 1.1397058823529413e-06, "logits/chosen": -0.34878969192504883, "logits/rejected": -0.3501427173614502, "logps/chosen": -96.84081268310547, "logps/rejected": -102.65656280517578, "loss": 0.6169, "rewards/accuracies": 1.0, "rewards/chosen": 0.107701875269413, "rewards/margins": 0.0031982436776161194, "rewards/rejected": 0.10450363159179688, "step": 31 }, { "epoch": 0.01, "learning_rate": 1.1764705882352942e-06, "logits/chosen": -0.4251815676689148, "logits/rejected": -0.029489390552043915, "logps/chosen": -151.92327880859375, "logps/rejected": -182.44970703125, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.13740845024585724, "rewards/margins": 0.09699402004480362, "rewards/rejected": 0.04041443020105362, "step": 32 }, { "epoch": 0.01, "learning_rate": 1.2132352941176472e-06, "logits/chosen": -0.5812998414039612, "logits/rejected": -0.5766876935958862, "logps/chosen": -86.65283966064453, "logps/rejected": -73.20912170410156, "loss": 0.6406, "rewards/accuracies": 1.0, "rewards/chosen": 0.20074462890625, "rewards/margins": 0.04750289022922516, "rewards/rejected": 0.15324173867702484, "step": 33 }, { "epoch": 0.01, "learning_rate": 1.25e-06, "logits/chosen": -0.34923455119132996, "logits/rejected": -0.3559107482433319, "logps/chosen": -113.23468017578125, "logps/rejected": -147.78846740722656, "loss": 0.6624, "rewards/accuracies": 1.0, "rewards/chosen": 0.3566032350063324, "rewards/margins": 0.06915509700775146, "rewards/rejected": 0.28744813799858093, "step": 34 }, { "epoch": 0.01, "learning_rate": 1.2867647058823528e-06, "logits/chosen": -0.5602127313613892, "logits/rejected": -0.557860255241394, "logps/chosen": -103.6727294921875, "logps/rejected": -163.81642150878906, "loss": 0.6137, "rewards/accuracies": 1.0, "rewards/chosen": 0.4052780270576477, "rewards/margins": 0.008453369140625, "rewards/rejected": 0.3968246579170227, "step": 35 }, { "epoch": 0.01, "learning_rate": 1.323529411764706e-06, "logits/chosen": -0.22402571141719818, "logits/rejected": -0.1790965497493744, "logps/chosen": -169.83033752441406, "logps/rejected": -112.68927764892578, "loss": 0.624, "rewards/accuracies": 1.0, "rewards/chosen": 0.37838441133499146, "rewards/margins": 0.1479850858449936, "rewards/rejected": 0.23039932548999786, "step": 36 }, { "epoch": 0.01, "learning_rate": 1.360294117647059e-06, "logits/chosen": -0.2856294810771942, "logits/rejected": -0.1915602833032608, "logps/chosen": -133.238525390625, "logps/rejected": -142.2467041015625, "loss": 0.6369, "rewards/accuracies": 1.0, "rewards/chosen": 0.3439193665981293, "rewards/margins": 0.13199156522750854, "rewards/rejected": 0.21192780137062073, "step": 37 }, { "epoch": 0.01, "learning_rate": 1.3970588235294119e-06, "logits/chosen": -0.8774591088294983, "logits/rejected": -0.9428436756134033, "logps/chosen": -89.92835235595703, "logps/rejected": -41.406837463378906, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.1061805710196495, "rewards/margins": 0.08187179267406464, "rewards/rejected": 0.02430877648293972, "step": 38 }, { "epoch": 0.01, "learning_rate": 1.4338235294117648e-06, "logits/chosen": -0.3430505096912384, "logits/rejected": -0.3452320992946625, "logps/chosen": -144.35665893554688, "logps/rejected": -116.88737487792969, "loss": 0.6976, "rewards/accuracies": 1.0, "rewards/chosen": 0.27606201171875, "rewards/margins": 0.03715667128562927, "rewards/rejected": 0.23890534043312073, "step": 39 }, { "epoch": 0.01, "learning_rate": 1.4705882352941177e-06, "logits/chosen": -0.49800771474838257, "logits/rejected": -0.5235397219657898, "logps/chosen": -69.4417495727539, "logps/rejected": -63.09830856323242, "loss": 0.7175, "rewards/accuracies": 0.0, "rewards/chosen": 0.11406173557043076, "rewards/margins": -0.09522133320569992, "rewards/rejected": 0.20928306877613068, "step": 40 }, { "epoch": 0.01, "learning_rate": 1.5073529411764707e-06, "logits/chosen": -0.809604823589325, "logits/rejected": -0.8698219060897827, "logps/chosen": -163.6959228515625, "logps/rejected": -30.369979858398438, "loss": 0.4958, "rewards/accuracies": 1.0, "rewards/chosen": 0.496145635843277, "rewards/margins": 0.46072426438331604, "rewards/rejected": 0.03542137145996094, "step": 41 }, { "epoch": 0.01, "learning_rate": 1.5441176470588238e-06, "logits/chosen": -0.3099535405635834, "logits/rejected": -0.2740246653556824, "logps/chosen": -87.58995056152344, "logps/rejected": -68.76277160644531, "loss": 0.7296, "rewards/accuracies": 0.0, "rewards/chosen": 0.10394745320081711, "rewards/margins": -0.07168731838464737, "rewards/rejected": 0.17563477158546448, "step": 42 }, { "epoch": 0.01, "learning_rate": 1.5808823529411765e-06, "logits/chosen": -0.3989619314670563, "logits/rejected": -0.36485755443573, "logps/chosen": -104.60716247558594, "logps/rejected": -187.61788940429688, "loss": 0.6922, "rewards/accuracies": 0.0, "rewards/chosen": 0.6732773184776306, "rewards/margins": -0.19122618436813354, "rewards/rejected": 0.8645035028457642, "step": 43 }, { "epoch": 0.01, "learning_rate": 1.6176470588235297e-06, "logits/chosen": -0.15126995742321014, "logits/rejected": -0.15126995742321014, "logps/chosen": -37.75403594970703, "logps/rejected": -37.75403594970703, "loss": 0.8157, "rewards/accuracies": 0.0, "rewards/chosen": 0.2281719297170639, "rewards/margins": 0.0, "rewards/rejected": 0.2281719297170639, "step": 44 }, { "epoch": 0.01, "learning_rate": 1.6544117647058824e-06, "logits/chosen": -0.20284157991409302, "logits/rejected": -0.22757506370544434, "logps/chosen": -47.04741287231445, "logps/rejected": -70.15644836425781, "loss": 0.7109, "rewards/accuracies": 0.0, "rewards/chosen": 0.19586677849292755, "rewards/margins": -0.040765002369880676, "rewards/rejected": 0.23663178086280823, "step": 45 }, { "epoch": 0.01, "learning_rate": 1.6911764705882356e-06, "logits/chosen": -0.5186604261398315, "logits/rejected": -0.5216479897499084, "logps/chosen": -209.47291564941406, "logps/rejected": -56.534942626953125, "loss": 0.5904, "rewards/accuracies": 1.0, "rewards/chosen": 0.7271530032157898, "rewards/margins": 0.2819885015487671, "rewards/rejected": 0.4451645016670227, "step": 46 }, { "epoch": 0.01, "learning_rate": 1.7279411764705883e-06, "logits/chosen": -0.3782798945903778, "logits/rejected": -0.37512606382369995, "logps/chosen": -56.298583984375, "logps/rejected": -81.95504760742188, "loss": 0.6706, "rewards/accuracies": 0.0, "rewards/chosen": 0.24536362290382385, "rewards/margins": -0.16656875610351562, "rewards/rejected": 0.4119323790073395, "step": 47 }, { "epoch": 0.01, "learning_rate": 1.7647058823529414e-06, "logits/chosen": -0.7044102549552917, "logits/rejected": -0.6114411354064941, "logps/chosen": -145.71104431152344, "logps/rejected": -79.15966033935547, "loss": 0.5663, "rewards/accuracies": 0.0, "rewards/chosen": 0.5043655633926392, "rewards/margins": -0.0955963134765625, "rewards/rejected": 0.5999618768692017, "step": 48 }, { "epoch": 0.01, "learning_rate": 1.8014705882352942e-06, "logits/chosen": -0.12500524520874023, "logits/rejected": -0.12813208997249603, "logps/chosen": -53.61009216308594, "logps/rejected": -54.3386344909668, "loss": 0.5863, "rewards/accuracies": 1.0, "rewards/chosen": 0.4285522401332855, "rewards/margins": 0.0891345739364624, "rewards/rejected": 0.3394176661968231, "step": 49 }, { "epoch": 0.01, "learning_rate": 1.8382352941176473e-06, "logits/chosen": -0.01554735004901886, "logits/rejected": -0.029375441372394562, "logps/chosen": -87.3914566040039, "logps/rejected": -87.76839447021484, "loss": 0.6747, "rewards/accuracies": 0.0, "rewards/chosen": 0.6201645135879517, "rewards/margins": -0.07663875818252563, "rewards/rejected": 0.6968032717704773, "step": 50 }, { "epoch": 0.01, "learning_rate": 1.8750000000000003e-06, "logits/chosen": -0.6702678203582764, "logits/rejected": -0.6783691644668579, "logps/chosen": -133.93765258789062, "logps/rejected": -138.970458984375, "loss": 0.4982, "rewards/accuracies": 1.0, "rewards/chosen": 1.058955430984497, "rewards/margins": 0.30076754093170166, "rewards/rejected": 0.7581878900527954, "step": 51 }, { "epoch": 0.01, "learning_rate": 1.9117647058823528e-06, "logits/chosen": -0.5911128520965576, "logits/rejected": -0.5847605466842651, "logps/chosen": -74.89207458496094, "logps/rejected": -179.65582275390625, "loss": 0.9107, "rewards/accuracies": 0.0, "rewards/chosen": 0.5489349365234375, "rewards/margins": -0.7937484979629517, "rewards/rejected": 1.3426834344863892, "step": 52 }, { "epoch": 0.01, "learning_rate": 1.948529411764706e-06, "logits/chosen": -0.7097669839859009, "logits/rejected": -0.683347225189209, "logps/chosen": -162.09793090820312, "logps/rejected": -80.62094116210938, "loss": 0.5272, "rewards/accuracies": 1.0, "rewards/chosen": 1.0306427478790283, "rewards/margins": 0.25803226232528687, "rewards/rejected": 0.7726104855537415, "step": 53 }, { "epoch": 0.01, "learning_rate": 1.985294117647059e-06, "logits/chosen": 0.07668696343898773, "logits/rejected": 0.10501103103160858, "logps/chosen": -64.32638549804688, "logps/rejected": -102.13880920410156, "loss": 0.6175, "rewards/accuracies": 0.0, "rewards/chosen": 0.7453300356864929, "rewards/margins": -0.016150712966918945, "rewards/rejected": 0.7614807486534119, "step": 54 }, { "epoch": 0.01, "learning_rate": 2.022058823529412e-06, "logits/chosen": -0.6471552848815918, "logits/rejected": -0.6042605042457581, "logps/chosen": -201.36727905273438, "logps/rejected": -42.375160217285156, "loss": 0.3277, "rewards/accuracies": 1.0, "rewards/chosen": 1.6312774419784546, "rewards/margins": 1.4980331659317017, "rewards/rejected": 0.1332443207502365, "step": 55 }, { "epoch": 0.01, "learning_rate": 2.058823529411765e-06, "logits/chosen": -0.530649721622467, "logits/rejected": -0.44651728868484497, "logps/chosen": -154.30967712402344, "logps/rejected": -39.637332916259766, "loss": 0.389, "rewards/accuracies": 1.0, "rewards/chosen": 1.6610885858535767, "rewards/margins": 1.603999376296997, "rewards/rejected": 0.05708923563361168, "step": 56 }, { "epoch": 0.01, "learning_rate": 2.095588235294118e-06, "logits/chosen": -0.6790821552276611, "logits/rejected": -0.6206755042076111, "logps/chosen": -83.65850830078125, "logps/rejected": -181.68260192871094, "loss": 0.9949, "rewards/accuracies": 0.0, "rewards/chosen": 0.8471435904502869, "rewards/margins": -0.848777711391449, "rewards/rejected": 1.6959213018417358, "step": 57 }, { "epoch": 0.01, "learning_rate": 2.132352941176471e-06, "logits/chosen": -0.49577826261520386, "logits/rejected": -0.42287755012512207, "logps/chosen": -64.8248291015625, "logps/rejected": -160.8303985595703, "loss": 0.6002, "rewards/accuracies": 1.0, "rewards/chosen": 1.2276413440704346, "rewards/margins": 0.04257810115814209, "rewards/rejected": 1.1850632429122925, "step": 58 }, { "epoch": 0.01, "learning_rate": 2.1691176470588238e-06, "logits/chosen": -0.32407140731811523, "logits/rejected": -0.30208244919776917, "logps/chosen": -47.898197174072266, "logps/rejected": -14.152242660522461, "loss": 0.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.7197811007499695, "rewards/margins": 0.5149282217025757, "rewards/rejected": 0.2048528641462326, "step": 59 }, { "epoch": 0.01, "learning_rate": 2.2058823529411767e-06, "logits/chosen": -0.18223842978477478, "logits/rejected": -0.10849008709192276, "logps/chosen": -80.65811157226562, "logps/rejected": -144.87191772460938, "loss": 0.8787, "rewards/accuracies": 0.0, "rewards/chosen": 1.3736778497695923, "rewards/margins": -0.8641074895858765, "rewards/rejected": 2.2377853393554688, "step": 60 }, { "epoch": 0.01, "learning_rate": 2.2426470588235296e-06, "logits/chosen": -0.7176641821861267, "logits/rejected": -0.6535364389419556, "logps/chosen": -199.01234436035156, "logps/rejected": -92.09306335449219, "loss": 0.3797, "rewards/accuracies": 1.0, "rewards/chosen": 1.6605300903320312, "rewards/margins": 0.7527122497558594, "rewards/rejected": 0.9078178405761719, "step": 61 }, { "epoch": 0.01, "learning_rate": 2.2794117647058826e-06, "logits/chosen": -0.5485034584999084, "logits/rejected": -0.5311028957366943, "logps/chosen": -47.82697296142578, "logps/rejected": -78.22639465332031, "loss": 0.5802, "rewards/accuracies": 1.0, "rewards/chosen": 0.9343299865722656, "rewards/margins": 0.15462112426757812, "rewards/rejected": 0.7797088623046875, "step": 62 }, { "epoch": 0.01, "learning_rate": 2.3161764705882355e-06, "logits/chosen": -0.12410567700862885, "logits/rejected": -0.07791145890951157, "logps/chosen": -52.10743713378906, "logps/rejected": -100.11622619628906, "loss": 0.4598, "rewards/accuracies": 1.0, "rewards/chosen": 0.9057388305664062, "rewards/margins": 0.007455408573150635, "rewards/rejected": 0.8982834219932556, "step": 63 }, { "epoch": 0.01, "learning_rate": 2.3529411764705885e-06, "logits/chosen": -0.4417186379432678, "logits/rejected": -0.4308227598667145, "logps/chosen": -80.4395980834961, "logps/rejected": -23.934906005859375, "loss": 1.0617, "rewards/accuracies": 1.0, "rewards/chosen": 0.5871421694755554, "rewards/margins": 0.4254625141620636, "rewards/rejected": 0.16167965531349182, "step": 64 }, { "epoch": 0.01, "learning_rate": 2.3897058823529414e-06, "logits/chosen": -0.5072620511054993, "logits/rejected": -0.5373456478118896, "logps/chosen": -65.43241882324219, "logps/rejected": -26.034265518188477, "loss": 0.5754, "rewards/accuracies": 1.0, "rewards/chosen": 1.2034873962402344, "rewards/margins": 1.058419942855835, "rewards/rejected": 0.14506740868091583, "step": 65 }, { "epoch": 0.01, "learning_rate": 2.4264705882352943e-06, "logits/chosen": -0.45285549759864807, "logits/rejected": 0.07864748686552048, "logps/chosen": -108.47547149658203, "logps/rejected": -153.0368194580078, "loss": 0.7588, "rewards/accuracies": 0.0, "rewards/chosen": 0.5240532159805298, "rewards/margins": -0.362615168094635, "rewards/rejected": 0.8866683840751648, "step": 66 }, { "epoch": 0.01, "learning_rate": 2.4632352941176473e-06, "logits/chosen": -0.30022233724594116, "logits/rejected": -0.28886932134628296, "logps/chosen": -76.45668029785156, "logps/rejected": -72.07412719726562, "loss": 0.6376, "rewards/accuracies": 0.0, "rewards/chosen": 1.435786485671997, "rewards/margins": -0.049471259117126465, "rewards/rejected": 1.4852577447891235, "step": 67 }, { "epoch": 0.02, "learning_rate": 2.5e-06, "logits/chosen": -0.5487456321716309, "logits/rejected": -0.48390519618988037, "logps/chosen": -43.579551696777344, "logps/rejected": -83.1786880493164, "loss": 0.8614, "rewards/accuracies": 1.0, "rewards/chosen": 0.2297370880842209, "rewards/margins": 0.016744986176490784, "rewards/rejected": 0.2129921019077301, "step": 68 }, { "epoch": 0.02, "learning_rate": 2.536764705882353e-06, "logits/chosen": -0.6825808882713318, "logits/rejected": -0.6638941168785095, "logps/chosen": -134.55679321289062, "logps/rejected": -99.35308074951172, "loss": 0.4448, "rewards/accuracies": 1.0, "rewards/chosen": 1.4698609113693237, "rewards/margins": 0.30113840103149414, "rewards/rejected": 1.1687225103378296, "step": 69 }, { "epoch": 0.02, "learning_rate": 2.5735294117647057e-06, "logits/chosen": -0.14001043140888214, "logits/rejected": -0.14746491611003876, "logps/chosen": -67.65711212158203, "logps/rejected": -51.17631530761719, "loss": 0.5553, "rewards/accuracies": 0.0, "rewards/chosen": 1.2017112970352173, "rewards/margins": -0.09556353092193604, "rewards/rejected": 1.2972748279571533, "step": 70 }, { "epoch": 0.02, "learning_rate": 2.610294117647059e-06, "logits/chosen": -0.6061571836471558, "logits/rejected": -0.5702472925186157, "logps/chosen": -172.7738037109375, "logps/rejected": -85.87664794921875, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 1.8749268054962158, "rewards/margins": 0.01046597957611084, "rewards/rejected": 1.864460825920105, "step": 71 }, { "epoch": 0.02, "learning_rate": 2.647058823529412e-06, "logits/chosen": -0.611802875995636, "logits/rejected": -0.6048727631568909, "logps/chosen": -62.5828857421875, "logps/rejected": -134.44984436035156, "loss": 0.5166, "rewards/accuracies": 1.0, "rewards/chosen": 1.6462920904159546, "rewards/margins": 0.4057159423828125, "rewards/rejected": 1.240576148033142, "step": 72 }, { "epoch": 0.02, "learning_rate": 2.683823529411765e-06, "logits/chosen": -0.921175479888916, "logits/rejected": -0.9453927278518677, "logps/chosen": -149.70677185058594, "logps/rejected": -91.14152526855469, "loss": 0.2102, "rewards/accuracies": 1.0, "rewards/chosen": 2.357774496078491, "rewards/margins": 1.4122917652130127, "rewards/rejected": 0.9454826712608337, "step": 73 }, { "epoch": 0.02, "learning_rate": 2.720588235294118e-06, "logits/chosen": -0.3912021517753601, "logits/rejected": -0.1860632598400116, "logps/chosen": -81.55545806884766, "logps/rejected": -163.16061401367188, "loss": 1.5408, "rewards/accuracies": 0.0, "rewards/chosen": 1.3216629028320312, "rewards/margins": -0.5801528692245483, "rewards/rejected": 1.9018157720565796, "step": 74 }, { "epoch": 0.02, "learning_rate": 2.757352941176471e-06, "logits/chosen": -0.4962090253829956, "logits/rejected": -0.3695378303527832, "logps/chosen": -103.99185180664062, "logps/rejected": -121.9426498413086, "loss": 0.389, "rewards/accuracies": 1.0, "rewards/chosen": 1.9414275884628296, "rewards/margins": 0.594506025314331, "rewards/rejected": 1.3469215631484985, "step": 75 }, { "epoch": 0.02, "learning_rate": 2.7941176470588237e-06, "logits/chosen": -0.40943047404289246, "logits/rejected": -0.3855925500392914, "logps/chosen": -63.053897857666016, "logps/rejected": -69.8508071899414, "loss": 0.7017, "rewards/accuracies": 1.0, "rewards/chosen": 1.315934419631958, "rewards/margins": 0.36599087715148926, "rewards/rejected": 0.9499435424804688, "step": 76 }, { "epoch": 0.02, "learning_rate": 2.8308823529411766e-06, "logits/chosen": -0.6855487823486328, "logits/rejected": -0.6667219400405884, "logps/chosen": -76.66422271728516, "logps/rejected": -124.45982360839844, "loss": 0.937, "rewards/accuracies": 1.0, "rewards/chosen": 1.5142539739608765, "rewards/margins": 0.07264173030853271, "rewards/rejected": 1.4416122436523438, "step": 77 }, { "epoch": 0.02, "learning_rate": 2.8676470588235296e-06, "logits/chosen": -0.10648231208324432, "logits/rejected": -0.06074801832437515, "logps/chosen": -106.5823974609375, "logps/rejected": -64.18620300292969, "loss": 0.638, "rewards/accuracies": 0.0, "rewards/chosen": 1.2146027088165283, "rewards/margins": -0.08766782283782959, "rewards/rejected": 1.302270531654358, "step": 78 }, { "epoch": 0.02, "learning_rate": 2.904411764705883e-06, "logits/chosen": -0.476207971572876, "logits/rejected": -0.4080403447151184, "logps/chosen": -76.90229797363281, "logps/rejected": -64.63249206542969, "loss": 0.5485, "rewards/accuracies": 1.0, "rewards/chosen": 2.1426796913146973, "rewards/margins": 0.7574989795684814, "rewards/rejected": 1.3851807117462158, "step": 79 }, { "epoch": 0.02, "learning_rate": 2.9411764705882355e-06, "logits/chosen": -0.35797804594039917, "logits/rejected": -0.4893401861190796, "logps/chosen": -202.76873779296875, "logps/rejected": -162.02850341796875, "loss": 0.6475, "rewards/accuracies": 1.0, "rewards/chosen": 2.545501708984375, "rewards/margins": 0.8830230236053467, "rewards/rejected": 1.6624786853790283, "step": 80 }, { "epoch": 0.02, "learning_rate": 2.9779411764705884e-06, "logits/chosen": -0.24171914160251617, "logits/rejected": -0.21256430447101593, "logps/chosen": -81.48949432373047, "logps/rejected": -107.9046401977539, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 1.139074683189392, "rewards/margins": -0.6043343544006348, "rewards/rejected": 1.7434090375900269, "step": 81 }, { "epoch": 0.02, "learning_rate": 3.0147058823529413e-06, "logits/chosen": -0.3455357849597931, "logits/rejected": -0.4042806625366211, "logps/chosen": -92.20388793945312, "logps/rejected": -84.47290802001953, "loss": 1.059, "rewards/accuracies": 0.0, "rewards/chosen": 0.886401355266571, "rewards/margins": -0.3503701090812683, "rewards/rejected": 1.2367714643478394, "step": 82 }, { "epoch": 0.02, "learning_rate": 3.0514705882352947e-06, "logits/chosen": -0.2878403663635254, "logits/rejected": -0.2823440432548523, "logps/chosen": -122.4408950805664, "logps/rejected": -184.3303680419922, "loss": 0.7275, "rewards/accuracies": 0.0, "rewards/chosen": 2.4207725524902344, "rewards/margins": -0.7708795070648193, "rewards/rejected": 3.1916520595550537, "step": 83 }, { "epoch": 0.02, "learning_rate": 3.0882352941176476e-06, "logits/chosen": -0.5114223957061768, "logits/rejected": -0.6252303719520569, "logps/chosen": -144.74658203125, "logps/rejected": -162.64637756347656, "loss": 0.9218, "rewards/accuracies": 1.0, "rewards/chosen": 2.277806043624878, "rewards/margins": 0.01148366928100586, "rewards/rejected": 2.266322374343872, "step": 84 }, { "epoch": 0.02, "learning_rate": 3.125e-06, "logits/chosen": -0.33935490250587463, "logits/rejected": -0.11722220480442047, "logps/chosen": -98.66422271728516, "logps/rejected": -96.22251892089844, "loss": 0.8756, "rewards/accuracies": 0.0, "rewards/chosen": 2.008740186691284, "rewards/margins": -0.6086418628692627, "rewards/rejected": 2.617382049560547, "step": 85 }, { "epoch": 0.02, "learning_rate": 3.161764705882353e-06, "logits/chosen": -0.3225340247154236, "logits/rejected": -0.23325523734092712, "logps/chosen": -60.88788604736328, "logps/rejected": -45.071041107177734, "loss": 0.7836, "rewards/accuracies": 1.0, "rewards/chosen": 0.6465004086494446, "rewards/margins": 0.026538491249084473, "rewards/rejected": 0.6199619174003601, "step": 86 }, { "epoch": 0.02, "learning_rate": 3.198529411764706e-06, "logits/chosen": -0.6678804755210876, "logits/rejected": -0.6807883977890015, "logps/chosen": -111.65872955322266, "logps/rejected": -79.91996002197266, "loss": 0.6953, "rewards/accuracies": 1.0, "rewards/chosen": 1.5017632246017456, "rewards/margins": 0.0380401611328125, "rewards/rejected": 1.463723063468933, "step": 87 }, { "epoch": 0.02, "learning_rate": 3.2352941176470594e-06, "logits/chosen": -0.12972542643547058, "logits/rejected": -0.1443905532360077, "logps/chosen": -93.61936950683594, "logps/rejected": -86.35736083984375, "loss": 0.4686, "rewards/accuracies": 0.0, "rewards/chosen": 0.9228324890136719, "rewards/margins": -0.15556716918945312, "rewards/rejected": 1.078399658203125, "step": 88 }, { "epoch": 0.02, "learning_rate": 3.272058823529412e-06, "logits/chosen": -0.46903640031814575, "logits/rejected": -0.3741898536682129, "logps/chosen": -140.66220092773438, "logps/rejected": -25.836994171142578, "loss": 0.3602, "rewards/accuracies": 1.0, "rewards/chosen": 3.6455841064453125, "rewards/margins": 3.5893867015838623, "rewards/rejected": 0.056197356432676315, "step": 89 }, { "epoch": 0.02, "learning_rate": 3.308823529411765e-06, "logits/chosen": -0.4054967164993286, "logits/rejected": -0.3912334144115448, "logps/chosen": -92.22698211669922, "logps/rejected": -17.212905883789062, "loss": 0.4181, "rewards/accuracies": 1.0, "rewards/chosen": 1.3474282026290894, "rewards/margins": 0.48838889598846436, "rewards/rejected": 0.859039306640625, "step": 90 }, { "epoch": 0.02, "learning_rate": 3.3455882352941178e-06, "logits/chosen": -0.6143524646759033, "logits/rejected": -0.6359739899635315, "logps/chosen": -67.70440673828125, "logps/rejected": -78.09623718261719, "loss": 0.4414, "rewards/accuracies": 1.0, "rewards/chosen": 1.692583441734314, "rewards/margins": 0.016925811767578125, "rewards/rejected": 1.6756576299667358, "step": 91 }, { "epoch": 0.02, "learning_rate": 3.382352941176471e-06, "logits/chosen": -0.2638254165649414, "logits/rejected": -0.17641621828079224, "logps/chosen": -111.97864532470703, "logps/rejected": -88.31395721435547, "loss": 0.4889, "rewards/accuracies": 1.0, "rewards/chosen": 2.9472815990448, "rewards/margins": 1.2215605974197388, "rewards/rejected": 1.725721001625061, "step": 92 }, { "epoch": 0.02, "learning_rate": 3.419117647058824e-06, "logits/chosen": -0.3346831798553467, "logits/rejected": -0.35675686597824097, "logps/chosen": -73.27828979492188, "logps/rejected": -66.30513000488281, "loss": 0.8261, "rewards/accuracies": 0.0, "rewards/chosen": 0.5028221011161804, "rewards/margins": -1.2021057605743408, "rewards/rejected": 1.7049278020858765, "step": 93 }, { "epoch": 0.02, "learning_rate": 3.4558823529411766e-06, "logits/chosen": -0.46060842275619507, "logits/rejected": -0.5011611580848694, "logps/chosen": -89.52571105957031, "logps/rejected": -67.62863159179688, "loss": 0.4367, "rewards/accuracies": 0.0, "rewards/chosen": 1.261189341545105, "rewards/margins": -0.11252129077911377, "rewards/rejected": 1.3737106323242188, "step": 94 }, { "epoch": 0.02, "learning_rate": 3.4926470588235295e-06, "logits/chosen": -0.5829344987869263, "logits/rejected": -0.5297641754150391, "logps/chosen": -70.58326721191406, "logps/rejected": -50.49412536621094, "loss": 0.5862, "rewards/accuracies": 1.0, "rewards/chosen": 1.2031387090682983, "rewards/margins": 0.26198500394821167, "rewards/rejected": 0.9411537051200867, "step": 95 }, { "epoch": 0.02, "learning_rate": 3.529411764705883e-06, "logits/chosen": -0.6389676928520203, "logits/rejected": -0.4578593373298645, "logps/chosen": -158.59732055664062, "logps/rejected": -82.9982681274414, "loss": 0.4933, "rewards/accuracies": 1.0, "rewards/chosen": 2.394561767578125, "rewards/margins": 0.08898544311523438, "rewards/rejected": 2.3055763244628906, "step": 96 }, { "epoch": 0.02, "learning_rate": 3.566176470588236e-06, "logits/chosen": -0.6663381457328796, "logits/rejected": -0.5442354083061218, "logps/chosen": -84.6453857421875, "logps/rejected": -155.54745483398438, "loss": 1.6263, "rewards/accuracies": 0.0, "rewards/chosen": 2.2031800746917725, "rewards/margins": -1.894871473312378, "rewards/rejected": 4.09805154800415, "step": 97 }, { "epoch": 0.02, "learning_rate": 3.6029411764705883e-06, "logits/chosen": -0.42875078320503235, "logits/rejected": -0.3875519931316376, "logps/chosen": -72.78341674804688, "logps/rejected": -83.15676879882812, "loss": 0.671, "rewards/accuracies": 0.0, "rewards/chosen": 2.163290500640869, "rewards/margins": -0.1413109302520752, "rewards/rejected": 2.3046014308929443, "step": 98 }, { "epoch": 0.02, "learning_rate": 3.6397058823529413e-06, "logits/chosen": -0.549003541469574, "logits/rejected": -0.584266185760498, "logps/chosen": -177.11166381835938, "logps/rejected": -165.6490936279297, "loss": 0.9168, "rewards/accuracies": 0.0, "rewards/chosen": 1.971826195716858, "rewards/margins": -1.5511764287948608, "rewards/rejected": 3.5230026245117188, "step": 99 }, { "epoch": 0.02, "learning_rate": 3.6764705882352946e-06, "logits/chosen": -0.6615611910820007, "logits/rejected": -0.4835680425167084, "logps/chosen": -177.16567993164062, "logps/rejected": -33.59101104736328, "loss": 0.6304, "rewards/accuracies": 1.0, "rewards/chosen": 3.4119598865509033, "rewards/margins": 3.155858278274536, "rewards/rejected": 0.2561016082763672, "step": 100 }, { "epoch": 0.02, "learning_rate": 3.7132352941176476e-06, "logits/chosen": -0.3657062351703644, "logits/rejected": -0.27725476026535034, "logps/chosen": -67.97969055175781, "logps/rejected": -20.16315460205078, "loss": 0.8985, "rewards/accuracies": 1.0, "rewards/chosen": 1.0373398065567017, "rewards/margins": 0.8675177097320557, "rewards/rejected": 0.16982212662696838, "step": 101 }, { "epoch": 0.02, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.7041209936141968, "logits/rejected": -0.6690388917922974, "logps/chosen": -156.17507934570312, "logps/rejected": -103.98674774169922, "loss": 1.1633, "rewards/accuracies": 0.0, "rewards/chosen": 2.5798721313476562, "rewards/margins": -0.7820396423339844, "rewards/rejected": 3.3619117736816406, "step": 102 }, { "epoch": 0.02, "learning_rate": 3.786764705882353e-06, "logits/chosen": -0.6813992261886597, "logits/rejected": -0.6564366817474365, "logps/chosen": -244.71185302734375, "logps/rejected": -80.40644836425781, "loss": 0.3104, "rewards/accuracies": 1.0, "rewards/chosen": 2.953320264816284, "rewards/margins": 1.0033293962478638, "rewards/rejected": 1.9499908685684204, "step": 103 }, { "epoch": 0.02, "learning_rate": 3.8235294117647055e-06, "logits/chosen": -0.7886402606964111, "logits/rejected": -0.750103771686554, "logps/chosen": -68.71652221679688, "logps/rejected": -24.733304977416992, "loss": 0.154, "rewards/accuracies": 1.0, "rewards/chosen": 2.2066566944122314, "rewards/margins": 1.6007115840911865, "rewards/rejected": 0.6059450507164001, "step": 104 }, { "epoch": 0.02, "learning_rate": 3.860294117647059e-06, "logits/chosen": -0.41232216358184814, "logits/rejected": -0.2690276801586151, "logps/chosen": -89.4222412109375, "logps/rejected": -84.45606994628906, "loss": 0.3883, "rewards/accuracies": 1.0, "rewards/chosen": 3.4263198375701904, "rewards/margins": 0.8122107982635498, "rewards/rejected": 2.6141090393066406, "step": 105 }, { "epoch": 0.02, "learning_rate": 3.897058823529412e-06, "logits/chosen": -0.5738980770111084, "logits/rejected": -0.5338523387908936, "logps/chosen": -52.887229919433594, "logps/rejected": -78.04651641845703, "loss": 0.8849, "rewards/accuracies": 0.0, "rewards/chosen": 0.872003972530365, "rewards/margins": -0.3206023573875427, "rewards/rejected": 1.1926063299179077, "step": 106 }, { "epoch": 0.02, "learning_rate": 3.933823529411765e-06, "logits/chosen": -0.5466158390045166, "logits/rejected": -0.4946848750114441, "logps/chosen": -83.04072570800781, "logps/rejected": -81.77764892578125, "loss": 0.3793, "rewards/accuracies": 1.0, "rewards/chosen": 2.1560776233673096, "rewards/margins": 0.5202362537384033, "rewards/rejected": 1.6358413696289062, "step": 107 }, { "epoch": 0.02, "learning_rate": 3.970588235294118e-06, "logits/chosen": -0.7486171126365662, "logits/rejected": -0.6774508357048035, "logps/chosen": -134.60635375976562, "logps/rejected": -95.093505859375, "loss": 0.4911, "rewards/accuracies": 1.0, "rewards/chosen": 4.039048671722412, "rewards/margins": 2.6893506050109863, "rewards/rejected": 1.3496979475021362, "step": 108 }, { "epoch": 0.02, "learning_rate": 4.007352941176471e-06, "logits/chosen": -0.513096272945404, "logits/rejected": -0.42997562885284424, "logps/chosen": -38.903076171875, "logps/rejected": -63.41689682006836, "loss": 1.2611, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280364751815796, "rewards/margins": 0.47953981161117554, "rewards/rejected": 0.548496663570404, "step": 109 }, { "epoch": 0.02, "learning_rate": 4.044117647058824e-06, "logits/chosen": -0.5926084518432617, "logits/rejected": -0.5784714818000793, "logps/chosen": -57.81879425048828, "logps/rejected": -42.769309997558594, "loss": 0.6849, "rewards/accuracies": 0.0, "rewards/chosen": 1.1802490949630737, "rewards/margins": -0.3937312364578247, "rewards/rejected": 1.5739803314208984, "step": 110 }, { "epoch": 0.02, "learning_rate": 4.080882352941177e-06, "logits/chosen": -0.716907262802124, "logits/rejected": -0.5387749671936035, "logps/chosen": -55.952674865722656, "logps/rejected": -89.12019348144531, "loss": 0.9299, "rewards/accuracies": 0.0, "rewards/chosen": 1.8875130414962769, "rewards/margins": -0.7936667203903198, "rewards/rejected": 2.6811797618865967, "step": 111 }, { "epoch": 0.02, "learning_rate": 4.11764705882353e-06, "logits/chosen": -0.6140555143356323, "logits/rejected": -0.5061612129211426, "logps/chosen": -52.39164352416992, "logps/rejected": -30.016822814941406, "loss": 0.5057, "rewards/accuracies": 1.0, "rewards/chosen": 0.8252063989639282, "rewards/margins": 0.20721971988677979, "rewards/rejected": 0.6179866790771484, "step": 112 }, { "epoch": 0.03, "learning_rate": 4.154411764705883e-06, "logits/chosen": -0.3268604576587677, "logits/rejected": -0.3115963339805603, "logps/chosen": -34.756507873535156, "logps/rejected": -70.08697509765625, "loss": 0.3307, "rewards/accuracies": 1.0, "rewards/chosen": 1.274605631828308, "rewards/margins": 0.15545892715454102, "rewards/rejected": 1.119146704673767, "step": 113 }, { "epoch": 0.03, "learning_rate": 4.191176470588236e-06, "logits/chosen": -0.5700930953025818, "logits/rejected": -0.5223700404167175, "logps/chosen": -80.11896514892578, "logps/rejected": -69.37350463867188, "loss": 0.2891, "rewards/accuracies": 1.0, "rewards/chosen": 1.9310531616210938, "rewards/margins": 1.001194715499878, "rewards/rejected": 0.929858386516571, "step": 114 }, { "epoch": 0.03, "learning_rate": 4.227941176470589e-06, "logits/chosen": -0.30075451731681824, "logits/rejected": -0.21533675491809845, "logps/chosen": -50.869606018066406, "logps/rejected": -14.883694648742676, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": 1.9251762628555298, "rewards/margins": 1.6439976692199707, "rewards/rejected": 0.2811785638332367, "step": 115 }, { "epoch": 0.03, "learning_rate": 4.264705882352942e-06, "logits/chosen": -0.4577484428882599, "logits/rejected": -0.45012542605400085, "logps/chosen": -97.45036315917969, "logps/rejected": -215.14065551757812, "loss": 2.0771, "rewards/accuracies": 0.0, "rewards/chosen": 1.5535567998886108, "rewards/margins": -2.7595624923706055, "rewards/rejected": 4.313119411468506, "step": 116 }, { "epoch": 0.03, "learning_rate": 4.301470588235295e-06, "logits/chosen": -0.5398274660110474, "logits/rejected": -0.46723616123199463, "logps/chosen": -161.46652221679688, "logps/rejected": -193.239501953125, "loss": 0.4216, "rewards/accuracies": 1.0, "rewards/chosen": 3.5582902431488037, "rewards/margins": 1.9097551107406616, "rewards/rejected": 1.648535132408142, "step": 117 }, { "epoch": 0.03, "learning_rate": 4.3382352941176475e-06, "logits/chosen": -0.7026846408843994, "logits/rejected": -0.7473633289337158, "logps/chosen": -25.034088134765625, "logps/rejected": -62.82462692260742, "loss": 0.8294, "rewards/accuracies": 0.0, "rewards/chosen": 1.6696220636367798, "rewards/margins": -0.24838757514953613, "rewards/rejected": 1.918009638786316, "step": 118 }, { "epoch": 0.03, "learning_rate": 4.3750000000000005e-06, "logits/chosen": -0.6795719265937805, "logits/rejected": -0.6899498701095581, "logps/chosen": -99.96065521240234, "logps/rejected": -104.72663879394531, "loss": 1.039, "rewards/accuracies": 0.0, "rewards/chosen": 1.6225334405899048, "rewards/margins": -1.5842751264572144, "rewards/rejected": 3.206808567047119, "step": 119 }, { "epoch": 0.03, "learning_rate": 4.411764705882353e-06, "logits/chosen": -0.4386045038700104, "logits/rejected": -0.41461145877838135, "logps/chosen": -114.27115631103516, "logps/rejected": -71.71290588378906, "loss": 0.5402, "rewards/accuracies": 0.0, "rewards/chosen": 1.377275824546814, "rewards/margins": -0.4697197675704956, "rewards/rejected": 1.8469955921173096, "step": 120 }, { "epoch": 0.03, "learning_rate": 4.448529411764706e-06, "logits/chosen": -0.5227380394935608, "logits/rejected": -0.4493204355239868, "logps/chosen": -139.78219604492188, "logps/rejected": -108.68849182128906, "loss": 0.3436, "rewards/accuracies": 1.0, "rewards/chosen": 3.148874044418335, "rewards/margins": 0.8941330909729004, "rewards/rejected": 2.2547409534454346, "step": 121 }, { "epoch": 0.03, "learning_rate": 4.485294117647059e-06, "logits/chosen": -0.4907597601413727, "logits/rejected": -0.43348339200019836, "logps/chosen": -54.393760681152344, "logps/rejected": -120.42124938964844, "loss": 0.7657, "rewards/accuracies": 0.0, "rewards/chosen": 2.1537697315216064, "rewards/margins": -1.2278847694396973, "rewards/rejected": 3.3816545009613037, "step": 122 }, { "epoch": 0.03, "learning_rate": 4.522058823529412e-06, "logits/chosen": -0.7014721632003784, "logits/rejected": -0.6662359237670898, "logps/chosen": -64.05550384521484, "logps/rejected": -66.91175842285156, "loss": 1.4795, "rewards/accuracies": 1.0, "rewards/chosen": 1.507947564125061, "rewards/margins": 0.5378257632255554, "rewards/rejected": 0.9701218008995056, "step": 123 }, { "epoch": 0.03, "learning_rate": 4.558823529411765e-06, "logits/chosen": -0.598091185092926, "logits/rejected": -0.598091185092926, "logps/chosen": -49.676918029785156, "logps/rejected": -49.676918029785156, "loss": 0.8486, "rewards/accuracies": 0.0, "rewards/chosen": 2.174614667892456, "rewards/margins": 0.0, "rewards/rejected": 2.174614667892456, "step": 124 }, { "epoch": 0.03, "learning_rate": 4.595588235294118e-06, "logits/chosen": -0.7332620620727539, "logits/rejected": -0.5599564909934998, "logps/chosen": -88.5316390991211, "logps/rejected": -117.06914520263672, "loss": 0.7763, "rewards/accuracies": 0.0, "rewards/chosen": 2.361293077468872, "rewards/margins": -0.37323689460754395, "rewards/rejected": 2.734529972076416, "step": 125 }, { "epoch": 0.03, "learning_rate": 4.632352941176471e-06, "logits/chosen": -0.4775194525718689, "logits/rejected": -0.4227680265903473, "logps/chosen": -109.29251098632812, "logps/rejected": -15.68498420715332, "loss": 0.3645, "rewards/accuracies": 1.0, "rewards/chosen": 1.1484253406524658, "rewards/margins": 0.5731199383735657, "rewards/rejected": 0.5753054022789001, "step": 126 }, { "epoch": 0.03, "learning_rate": 4.669117647058824e-06, "logits/chosen": -0.5459548830986023, "logits/rejected": -0.571084201335907, "logps/chosen": -73.76972961425781, "logps/rejected": -95.59834289550781, "loss": 1.345, "rewards/accuracies": 0.0, "rewards/chosen": 0.8477020263671875, "rewards/margins": -2.5713212490081787, "rewards/rejected": 3.419023275375366, "step": 127 }, { "epoch": 0.03, "learning_rate": 4.705882352941177e-06, "logits/chosen": -0.48132583498954773, "logits/rejected": -0.48132583498954773, "logps/chosen": -49.89375686645508, "logps/rejected": -49.89375686645508, "loss": 0.6915, "rewards/accuracies": 0.0, "rewards/chosen": 1.4846858978271484, "rewards/margins": 0.0, "rewards/rejected": 1.4846858978271484, "step": 128 }, { "epoch": 0.03, "learning_rate": 4.74264705882353e-06, "logits/chosen": -0.24863383173942566, "logits/rejected": -0.14389027655124664, "logps/chosen": -47.77025604248047, "logps/rejected": -35.727622985839844, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816368222236633, "rewards/margins": 1.1727931499481201, "rewards/rejected": -0.19115638732910156, "step": 129 }, { "epoch": 0.03, "learning_rate": 4.779411764705883e-06, "logits/chosen": -0.5481343269348145, "logits/rejected": -0.4764658808708191, "logps/chosen": -44.28479766845703, "logps/rejected": -42.85221481323242, "loss": 0.3336, "rewards/accuracies": 1.0, "rewards/chosen": 2.1612579822540283, "rewards/margins": 1.1655793190002441, "rewards/rejected": 0.995678722858429, "step": 130 }, { "epoch": 0.03, "learning_rate": 4.816176470588236e-06, "logits/chosen": -0.7194216251373291, "logits/rejected": -0.7194216251373291, "logps/chosen": -43.46687316894531, "logps/rejected": -43.46687316894531, "loss": 0.8711, "rewards/accuracies": 0.0, "rewards/chosen": 1.5432995557785034, "rewards/margins": 0.0, "rewards/rejected": 1.5432995557785034, "step": 131 }, { "epoch": 0.03, "learning_rate": 4.852941176470589e-06, "logits/chosen": -0.42001327872276306, "logits/rejected": -0.5084734559059143, "logps/chosen": -53.27666473388672, "logps/rejected": -128.4354248046875, "loss": 2.2195, "rewards/accuracies": 0.0, "rewards/chosen": 1.4508880376815796, "rewards/margins": -3.8585386276245117, "rewards/rejected": 5.309426784515381, "step": 132 }, { "epoch": 0.03, "learning_rate": 4.889705882352942e-06, "logits/chosen": -0.8675550222396851, "logits/rejected": -0.19820886850357056, "logps/chosen": -54.14980697631836, "logps/rejected": -70.571533203125, "loss": 1.4705, "rewards/accuracies": 0.0, "rewards/chosen": 2.1551716327667236, "rewards/margins": -1.811378002166748, "rewards/rejected": 3.9665496349334717, "step": 133 }, { "epoch": 0.03, "learning_rate": 4.9264705882352945e-06, "logits/chosen": -0.3345029354095459, "logits/rejected": -0.3257172107696533, "logps/chosen": -60.72698974609375, "logps/rejected": -87.21429443359375, "loss": 2.2384, "rewards/accuracies": 0.0, "rewards/chosen": 1.112951636314392, "rewards/margins": -3.1313962936401367, "rewards/rejected": 4.244348049163818, "step": 134 }, { "epoch": 0.03, "learning_rate": 4.9632352941176475e-06, "logits/chosen": -0.903695821762085, "logits/rejected": -0.8355706334114075, "logps/chosen": -153.33786010742188, "logps/rejected": -76.28500366210938, "loss": 0.2694, "rewards/accuracies": 1.0, "rewards/chosen": 3.076267957687378, "rewards/margins": 1.6144652366638184, "rewards/rejected": 1.4618027210235596, "step": 135 }, { "epoch": 0.03, "learning_rate": 5e-06, "logits/chosen": -1.045229196548462, "logits/rejected": -0.9013482928276062, "logps/chosen": -103.50711059570312, "logps/rejected": -125.7689208984375, "loss": 0.9023, "rewards/accuracies": 1.0, "rewards/chosen": 5.477621555328369, "rewards/margins": 0.22420787811279297, "rewards/rejected": 5.253413677215576, "step": 136 }, { "epoch": 0.03, "learning_rate": 5.036764705882353e-06, "logits/chosen": -0.4962121546268463, "logits/rejected": -0.654007077217102, "logps/chosen": -87.51129150390625, "logps/rejected": -130.14584350585938, "loss": 2.2329, "rewards/accuracies": 0.0, "rewards/chosen": 2.5020203590393066, "rewards/margins": -3.8425064086914062, "rewards/rejected": 6.344526767730713, "step": 137 }, { "epoch": 0.03, "learning_rate": 5.073529411764706e-06, "logits/chosen": -0.9250233769416809, "logits/rejected": -0.7370604276657104, "logps/chosen": -142.7489013671875, "logps/rejected": -50.08734130859375, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": 3.8929200172424316, "rewards/margins": 3.702590227127075, "rewards/rejected": 0.19032974541187286, "step": 138 }, { "epoch": 0.03, "learning_rate": 5.110294117647059e-06, "logits/chosen": -0.4547399580478668, "logits/rejected": -0.450466126203537, "logps/chosen": -60.49848175048828, "logps/rejected": -58.65495300292969, "loss": 0.8079, "rewards/accuracies": 0.0, "rewards/chosen": 1.6584281921386719, "rewards/margins": -1.1187317371368408, "rewards/rejected": 2.7771599292755127, "step": 139 }, { "epoch": 0.03, "learning_rate": 5.147058823529411e-06, "logits/chosen": -0.6866167783737183, "logits/rejected": -0.6807684898376465, "logps/chosen": -136.944091796875, "logps/rejected": -92.42593383789062, "loss": 0.9414, "rewards/accuracies": 0.0, "rewards/chosen": 3.276419162750244, "rewards/margins": -1.5863738059997559, "rewards/rejected": 4.86279296875, "step": 140 }, { "epoch": 0.03, "learning_rate": 5.183823529411766e-06, "logits/chosen": -0.9084147810935974, "logits/rejected": -0.8286832571029663, "logps/chosen": -136.31259155273438, "logps/rejected": -90.84861755371094, "loss": 1.4322, "rewards/accuracies": 1.0, "rewards/chosen": 3.5785889625549316, "rewards/margins": 0.7854592800140381, "rewards/rejected": 2.7931296825408936, "step": 141 }, { "epoch": 0.03, "learning_rate": 5.220588235294118e-06, "logits/chosen": -0.19997411966323853, "logits/rejected": -0.18408387899398804, "logps/chosen": -47.000423431396484, "logps/rejected": -51.4423828125, "loss": 2.2709, "rewards/accuracies": 1.0, "rewards/chosen": 2.2997701168060303, "rewards/margins": 1.1265023946762085, "rewards/rejected": 1.1732677221298218, "step": 142 }, { "epoch": 0.03, "learning_rate": 5.257352941176471e-06, "logits/chosen": -0.884845495223999, "logits/rejected": -0.8572573065757751, "logps/chosen": -28.186058044433594, "logps/rejected": -40.53385925292969, "loss": 0.3966, "rewards/accuracies": 0.0, "rewards/chosen": 1.2588173151016235, "rewards/margins": -0.18807530403137207, "rewards/rejected": 1.4468926191329956, "step": 143 }, { "epoch": 0.03, "learning_rate": 5.294117647058824e-06, "logits/chosen": -0.3936195373535156, "logits/rejected": -0.1584724634885788, "logps/chosen": -47.27491760253906, "logps/rejected": -48.05692672729492, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": 2.2247331142425537, "rewards/margins": 1.081969976425171, "rewards/rejected": 1.1427631378173828, "step": 144 }, { "epoch": 0.03, "learning_rate": 5.330882352941177e-06, "logits/chosen": -0.5963045358657837, "logits/rejected": -0.5963045358657837, "logps/chosen": -102.62738037109375, "logps/rejected": -102.62738037109375, "loss": 0.8511, "rewards/accuracies": 0.0, "rewards/chosen": 1.1485503911972046, "rewards/margins": 0.0, "rewards/rejected": 1.1485503911972046, "step": 145 }, { "epoch": 0.03, "learning_rate": 5.36764705882353e-06, "logits/chosen": -0.5211145877838135, "logits/rejected": -0.33798861503601074, "logps/chosen": -91.02287292480469, "logps/rejected": -84.18330383300781, "loss": 0.3533, "rewards/accuracies": 1.0, "rewards/chosen": 1.3599640130996704, "rewards/margins": 0.13498377799987793, "rewards/rejected": 1.2249802350997925, "step": 146 }, { "epoch": 0.03, "learning_rate": 5.404411764705883e-06, "logits/chosen": -0.5900718569755554, "logits/rejected": -0.5377398729324341, "logps/chosen": -56.80762481689453, "logps/rejected": -82.95960998535156, "loss": 0.293, "rewards/accuracies": 1.0, "rewards/chosen": 2.7697930335998535, "rewards/margins": 0.3611290454864502, "rewards/rejected": 2.4086639881134033, "step": 147 }, { "epoch": 0.03, "learning_rate": 5.441176470588236e-06, "logits/chosen": -0.5838168859481812, "logits/rejected": -0.36008772253990173, "logps/chosen": -221.93927001953125, "logps/rejected": -28.379249572753906, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 4.285864353179932, "rewards/margins": 3.8371543884277344, "rewards/rejected": 0.4487098753452301, "step": 148 }, { "epoch": 0.03, "learning_rate": 5.4779411764705894e-06, "logits/chosen": -0.646426260471344, "logits/rejected": -0.4842749238014221, "logps/chosen": -140.8129119873047, "logps/rejected": -81.99022674560547, "loss": 0.3856, "rewards/accuracies": 1.0, "rewards/chosen": 4.751094341278076, "rewards/margins": 2.9368975162506104, "rewards/rejected": 1.8141968250274658, "step": 149 }, { "epoch": 0.03, "learning_rate": 5.514705882352942e-06, "logits/chosen": -0.6215337514877319, "logits/rejected": -0.6080360412597656, "logps/chosen": -53.81296920776367, "logps/rejected": -86.99638366699219, "loss": 1.7238, "rewards/accuracies": 0.0, "rewards/chosen": 2.373279333114624, "rewards/margins": -2.2902047634124756, "rewards/rejected": 4.6634840965271, "step": 150 }, { "epoch": 0.03, "learning_rate": 5.5514705882352945e-06, "logits/chosen": -0.802050769329071, "logits/rejected": -0.7324634790420532, "logps/chosen": -80.0787124633789, "logps/rejected": -12.865167617797852, "loss": 0.2154, "rewards/accuracies": 1.0, "rewards/chosen": 1.33172607421875, "rewards/margins": 0.958449125289917, "rewards/rejected": 0.3732769191265106, "step": 151 }, { "epoch": 0.03, "learning_rate": 5.588235294117647e-06, "logits/chosen": -0.5490809082984924, "logits/rejected": -0.5034664273262024, "logps/chosen": -42.608150482177734, "logps/rejected": -64.87682342529297, "loss": 0.7605, "rewards/accuracies": 0.0, "rewards/chosen": 1.460700273513794, "rewards/margins": -0.44672393798828125, "rewards/rejected": 1.9074242115020752, "step": 152 }, { "epoch": 0.03, "learning_rate": 5.625e-06, "logits/chosen": -0.27759504318237305, "logits/rejected": 0.15208452939987183, "logps/chosen": -78.26759338378906, "logps/rejected": -138.5025177001953, "loss": 0.4779, "rewards/accuracies": 0.0, "rewards/chosen": 2.109431505203247, "rewards/margins": -0.45989084243774414, "rewards/rejected": 2.569322347640991, "step": 153 }, { "epoch": 0.03, "learning_rate": 5.661764705882353e-06, "logits/chosen": -0.5746723413467407, "logits/rejected": -0.46006616950035095, "logps/chosen": -60.759429931640625, "logps/rejected": -107.40662384033203, "loss": 0.624, "rewards/accuracies": 1.0, "rewards/chosen": 3.246047258377075, "rewards/margins": 0.8320374488830566, "rewards/rejected": 2.4140098094940186, "step": 154 }, { "epoch": 0.03, "learning_rate": 5.698529411764706e-06, "logits/chosen": -0.3003200590610504, "logits/rejected": -0.2117212861776352, "logps/chosen": -61.426429748535156, "logps/rejected": -54.542236328125, "loss": 2.2557, "rewards/accuracies": 0.0, "rewards/chosen": 1.2701225280761719, "rewards/margins": -1.3360068798065186, "rewards/rejected": 2.6061294078826904, "step": 155 }, { "epoch": 0.03, "learning_rate": 5.735294117647059e-06, "logits/chosen": -0.6129115223884583, "logits/rejected": -0.6314488053321838, "logps/chosen": -164.9375, "logps/rejected": -76.13754272460938, "loss": 1.1204, "rewards/accuracies": 0.0, "rewards/chosen": 3.8838348388671875, "rewards/margins": -1.6650652885437012, "rewards/rejected": 5.548900127410889, "step": 156 }, { "epoch": 0.03, "learning_rate": 5.772058823529412e-06, "logits/chosen": -1.238273024559021, "logits/rejected": -1.0429017543792725, "logps/chosen": -124.85382080078125, "logps/rejected": -75.84009552001953, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 6.2672119140625, "rewards/margins": 2.3334007263183594, "rewards/rejected": 3.9338111877441406, "step": 157 }, { "epoch": 0.03, "learning_rate": 5.808823529411766e-06, "logits/chosen": -0.7627918720245361, "logits/rejected": -0.6995992064476013, "logps/chosen": -128.68902587890625, "logps/rejected": -55.49308776855469, "loss": 1.2452, "rewards/accuracies": 1.0, "rewards/chosen": 4.059048652648926, "rewards/margins": 2.3377299308776855, "rewards/rejected": 1.7213188409805298, "step": 158 }, { "epoch": 0.04, "learning_rate": 5.845588235294119e-06, "logits/chosen": -0.8373472690582275, "logits/rejected": -0.625196635723114, "logps/chosen": -125.4159164428711, "logps/rejected": -61.9443473815918, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 4.933640480041504, "rewards/margins": 2.1734158992767334, "rewards/rejected": 2.7602245807647705, "step": 159 }, { "epoch": 0.04, "learning_rate": 5.882352941176471e-06, "logits/chosen": -0.7750552296638489, "logits/rejected": -0.6132495403289795, "logps/chosen": -154.97406005859375, "logps/rejected": -32.99696731567383, "loss": 0.2157, "rewards/accuracies": 1.0, "rewards/chosen": 3.796525716781616, "rewards/margins": 1.237222671508789, "rewards/rejected": 2.559303045272827, "step": 160 }, { "epoch": 0.04, "learning_rate": 5.919117647058824e-06, "logits/chosen": -1.0082402229309082, "logits/rejected": -0.879557728767395, "logps/chosen": -106.48715209960938, "logps/rejected": -39.092063903808594, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 6.198327541351318, "rewards/margins": 6.119326591491699, "rewards/rejected": 0.07900085300207138, "step": 161 }, { "epoch": 0.04, "learning_rate": 5.955882352941177e-06, "logits/chosen": -0.731336236000061, "logits/rejected": -0.5780636668205261, "logps/chosen": -193.08306884765625, "logps/rejected": -81.40953063964844, "loss": 0.3962, "rewards/accuracies": 1.0, "rewards/chosen": 6.82073974609375, "rewards/margins": 3.980438232421875, "rewards/rejected": 2.840301513671875, "step": 162 }, { "epoch": 0.04, "learning_rate": 5.99264705882353e-06, "logits/chosen": -0.40765708684921265, "logits/rejected": -0.36200007796287537, "logps/chosen": -114.84722900390625, "logps/rejected": -44.778533935546875, "loss": 0.3239, "rewards/accuracies": 1.0, "rewards/chosen": 2.7497239112854004, "rewards/margins": 0.3882615566253662, "rewards/rejected": 2.361462354660034, "step": 163 }, { "epoch": 0.04, "learning_rate": 6.029411764705883e-06, "logits/chosen": -0.5285773277282715, "logits/rejected": -0.42259374260902405, "logps/chosen": -92.73481750488281, "logps/rejected": -80.91139221191406, "loss": 0.2312, "rewards/accuracies": 1.0, "rewards/chosen": 3.5657289028167725, "rewards/margins": 1.5422890186309814, "rewards/rejected": 2.023439884185791, "step": 164 }, { "epoch": 0.04, "learning_rate": 6.066176470588236e-06, "logits/chosen": -0.71641606092453, "logits/rejected": -0.664595365524292, "logps/chosen": -144.81210327148438, "logps/rejected": -111.34507751464844, "loss": 1.6131, "rewards/accuracies": 0.0, "rewards/chosen": 4.502325534820557, "rewards/margins": -3.1741957664489746, "rewards/rejected": 7.676521301269531, "step": 165 }, { "epoch": 0.04, "learning_rate": 6.102941176470589e-06, "logits/chosen": -0.6237502098083496, "logits/rejected": -0.6373482942581177, "logps/chosen": -141.1066131591797, "logps/rejected": -138.29718017578125, "loss": 1.0256, "rewards/accuracies": 0.0, "rewards/chosen": 2.735377550125122, "rewards/margins": -1.8730881214141846, "rewards/rejected": 4.608465671539307, "step": 166 }, { "epoch": 0.04, "learning_rate": 6.139705882352942e-06, "logits/chosen": -0.5499320030212402, "logits/rejected": -0.5499320030212402, "logps/chosen": -74.1896743774414, "logps/rejected": -74.1896743774414, "loss": 0.582, "rewards/accuracies": 0.0, "rewards/chosen": 1.1141036748886108, "rewards/margins": 0.0, "rewards/rejected": 1.1141036748886108, "step": 167 }, { "epoch": 0.04, "learning_rate": 6.176470588235295e-06, "logits/chosen": -1.0231425762176514, "logits/rejected": -0.9635111093521118, "logps/chosen": -43.484596252441406, "logps/rejected": -123.73495483398438, "loss": 1.7746, "rewards/accuracies": 0.0, "rewards/chosen": 3.1159508228302, "rewards/margins": -2.4518563747406006, "rewards/rejected": 5.567807197570801, "step": 168 }, { "epoch": 0.04, "learning_rate": 6.213235294117647e-06, "logits/chosen": -0.5073555111885071, "logits/rejected": -0.5073555111885071, "logps/chosen": -37.819400787353516, "logps/rejected": -37.819400787353516, "loss": 0.404, "rewards/accuracies": 0.0, "rewards/chosen": 1.7375980615615845, "rewards/margins": 0.0, "rewards/rejected": 1.7375980615615845, "step": 169 }, { "epoch": 0.04, "learning_rate": 6.25e-06, "logits/chosen": -0.7383387088775635, "logits/rejected": -0.6528916358947754, "logps/chosen": -142.264404296875, "logps/rejected": -176.0459747314453, "loss": 1.0891, "rewards/accuracies": 0.0, "rewards/chosen": 4.264873027801514, "rewards/margins": -2.0292372703552246, "rewards/rejected": 6.294110298156738, "step": 170 }, { "epoch": 0.04, "learning_rate": 6.286764705882353e-06, "logits/chosen": -0.7932865619659424, "logits/rejected": -0.7579681277275085, "logps/chosen": -120.183349609375, "logps/rejected": -87.63398742675781, "loss": 0.9882, "rewards/accuracies": 1.0, "rewards/chosen": 3.5936126708984375, "rewards/margins": 0.8754522800445557, "rewards/rejected": 2.718160390853882, "step": 171 }, { "epoch": 0.04, "learning_rate": 6.323529411764706e-06, "logits/chosen": -1.200767159461975, "logits/rejected": -1.0204532146453857, "logps/chosen": -128.20050048828125, "logps/rejected": -113.4544677734375, "loss": 0.7881, "rewards/accuracies": 1.0, "rewards/chosen": 5.309525966644287, "rewards/margins": 1.704742193222046, "rewards/rejected": 3.604783773422241, "step": 172 }, { "epoch": 0.04, "learning_rate": 6.360294117647059e-06, "logits/chosen": -1.0091502666473389, "logits/rejected": -0.88497394323349, "logps/chosen": -163.48585510253906, "logps/rejected": -109.40794372558594, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 7.3521928787231445, "rewards/margins": 1.9686522483825684, "rewards/rejected": 5.383540630340576, "step": 173 }, { "epoch": 0.04, "learning_rate": 6.397058823529412e-06, "logits/chosen": -0.9208017587661743, "logits/rejected": -0.7974380254745483, "logps/chosen": -170.50062561035156, "logps/rejected": -68.84362030029297, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 6.412245273590088, "rewards/margins": 3.361452579498291, "rewards/rejected": 3.050792694091797, "step": 174 }, { "epoch": 0.04, "learning_rate": 6.433823529411766e-06, "logits/chosen": -0.367426335811615, "logits/rejected": -0.32990553975105286, "logps/chosen": -37.93220520019531, "logps/rejected": -29.463237762451172, "loss": 0.2633, "rewards/accuracies": 1.0, "rewards/chosen": 2.005985975265503, "rewards/margins": 0.9400362968444824, "rewards/rejected": 1.0659496784210205, "step": 175 }, { "epoch": 0.04, "learning_rate": 6.470588235294119e-06, "logits/chosen": -0.34504982829093933, "logits/rejected": -0.47621065378189087, "logps/chosen": -72.65969848632812, "logps/rejected": -107.87306213378906, "loss": 2.2116, "rewards/accuracies": 0.0, "rewards/chosen": 2.8676040172576904, "rewards/margins": -2.26230788230896, "rewards/rejected": 5.12991189956665, "step": 176 }, { "epoch": 0.04, "learning_rate": 6.507352941176472e-06, "logits/chosen": -0.7383033633232117, "logits/rejected": -0.6646350622177124, "logps/chosen": -158.3054656982422, "logps/rejected": -91.03248596191406, "loss": 1.0481, "rewards/accuracies": 1.0, "rewards/chosen": 5.315385341644287, "rewards/margins": 2.1703460216522217, "rewards/rejected": 3.1450393199920654, "step": 177 }, { "epoch": 0.04, "learning_rate": 6.544117647058824e-06, "logits/chosen": -0.8410055637359619, "logits/rejected": -0.8176929354667664, "logps/chosen": -64.76878356933594, "logps/rejected": -53.96135330200195, "loss": 1.6336, "rewards/accuracies": 0.0, "rewards/chosen": 1.0756622552871704, "rewards/margins": -1.8435078859329224, "rewards/rejected": 2.9191701412200928, "step": 178 }, { "epoch": 0.04, "learning_rate": 6.580882352941177e-06, "logits/chosen": -0.29512980580329895, "logits/rejected": -0.2896556556224823, "logps/chosen": -61.846458435058594, "logps/rejected": -43.53293991088867, "loss": 1.4826, "rewards/accuracies": 0.0, "rewards/chosen": 2.743175506591797, "rewards/margins": -0.1142728328704834, "rewards/rejected": 2.8574483394622803, "step": 179 }, { "epoch": 0.04, "learning_rate": 6.61764705882353e-06, "logits/chosen": -0.7831121683120728, "logits/rejected": -0.7018291354179382, "logps/chosen": -75.33206176757812, "logps/rejected": -34.15512466430664, "loss": 0.3467, "rewards/accuracies": 1.0, "rewards/chosen": 1.9144271612167358, "rewards/margins": 1.5819350481033325, "rewards/rejected": 0.33249208331108093, "step": 180 }, { "epoch": 0.04, "learning_rate": 6.654411764705883e-06, "logits/chosen": -0.6230806708335876, "logits/rejected": -0.572253942489624, "logps/chosen": -47.31895446777344, "logps/rejected": -94.25144958496094, "loss": 0.4627, "rewards/accuracies": 0.0, "rewards/chosen": 1.3553017377853394, "rewards/margins": -0.3895270824432373, "rewards/rejected": 1.7448288202285767, "step": 181 }, { "epoch": 0.04, "learning_rate": 6.6911764705882356e-06, "logits/chosen": -0.7639177441596985, "logits/rejected": -0.774625301361084, "logps/chosen": -63.84900665283203, "logps/rejected": -76.05120849609375, "loss": 1.3986, "rewards/accuracies": 1.0, "rewards/chosen": 3.193837881088257, "rewards/margins": 1.4704492092132568, "rewards/rejected": 1.723388671875, "step": 182 }, { "epoch": 0.04, "learning_rate": 6.727941176470589e-06, "logits/chosen": -1.1239674091339111, "logits/rejected": -1.0522631406784058, "logps/chosen": -80.29515075683594, "logps/rejected": -84.22142028808594, "loss": 1.6077, "rewards/accuracies": 0.0, "rewards/chosen": 1.5789985656738281, "rewards/margins": -2.266652822494507, "rewards/rejected": 3.845651388168335, "step": 183 }, { "epoch": 0.04, "learning_rate": 6.764705882352942e-06, "logits/chosen": -0.905108630657196, "logits/rejected": -0.7050167918205261, "logps/chosen": -192.61831665039062, "logps/rejected": -116.65412902832031, "loss": 0.165, "rewards/accuracies": 1.0, "rewards/chosen": 5.2093963623046875, "rewards/margins": 1.5513136386871338, "rewards/rejected": 3.6580827236175537, "step": 184 }, { "epoch": 0.04, "learning_rate": 6.801470588235295e-06, "logits/chosen": -0.48664432764053345, "logits/rejected": -0.48664432764053345, "logps/chosen": -23.952503204345703, "logps/rejected": -23.952503204345703, "loss": 0.8682, "rewards/accuracies": 0.0, "rewards/chosen": 1.910776138305664, "rewards/margins": 0.0, "rewards/rejected": 1.910776138305664, "step": 185 }, { "epoch": 0.04, "learning_rate": 6.838235294117648e-06, "logits/chosen": -0.5968053936958313, "logits/rejected": -0.39441266655921936, "logps/chosen": -56.114051818847656, "logps/rejected": -44.99665069580078, "loss": 0.7507, "rewards/accuracies": 0.0, "rewards/chosen": 1.3135734796524048, "rewards/margins": -0.6892281770706177, "rewards/rejected": 2.0028016567230225, "step": 186 }, { "epoch": 0.04, "learning_rate": 6.875e-06, "logits/chosen": -0.5055091381072998, "logits/rejected": -0.3809961974620819, "logps/chosen": -55.800987243652344, "logps/rejected": -69.4870376586914, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 3.7003815174102783, "rewards/margins": 2.6263389587402344, "rewards/rejected": 1.074042558670044, "step": 187 }, { "epoch": 0.04, "learning_rate": 6.911764705882353e-06, "logits/chosen": -0.6023845076560974, "logits/rejected": -0.47395196557044983, "logps/chosen": -67.00979614257812, "logps/rejected": -62.166404724121094, "loss": 0.2763, "rewards/accuracies": 1.0, "rewards/chosen": 1.9430595636367798, "rewards/margins": 0.6708236932754517, "rewards/rejected": 1.2722358703613281, "step": 188 }, { "epoch": 0.04, "learning_rate": 6.948529411764706e-06, "logits/chosen": -0.7280184626579285, "logits/rejected": -0.6766398549079895, "logps/chosen": -60.529685974121094, "logps/rejected": -55.431053161621094, "loss": 0.3328, "rewards/accuracies": 1.0, "rewards/chosen": 2.6854164600372314, "rewards/margins": 0.20932769775390625, "rewards/rejected": 2.476088762283325, "step": 189 }, { "epoch": 0.04, "learning_rate": 6.985294117647059e-06, "logits/chosen": -0.39481526613235474, "logits/rejected": -0.3549438416957855, "logps/chosen": -49.059898376464844, "logps/rejected": -39.0284309387207, "loss": 0.6116, "rewards/accuracies": 0.0, "rewards/chosen": 2.733945608139038, "rewards/margins": -0.7934558391571045, "rewards/rejected": 3.5274014472961426, "step": 190 }, { "epoch": 0.04, "learning_rate": 7.022058823529412e-06, "logits/chosen": -0.6516087651252747, "logits/rejected": -0.6260188817977905, "logps/chosen": -46.7285041809082, "logps/rejected": -42.54310607910156, "loss": 1.6761, "rewards/accuracies": 1.0, "rewards/chosen": 2.7027478218078613, "rewards/margins": 0.5694644451141357, "rewards/rejected": 2.1332833766937256, "step": 191 }, { "epoch": 0.04, "learning_rate": 7.058823529411766e-06, "logits/chosen": -0.533976137638092, "logits/rejected": -0.4348486661911011, "logps/chosen": -125.8293685913086, "logps/rejected": -56.33126449584961, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 4.8177409172058105, "rewards/margins": 2.9870986938476562, "rewards/rejected": 1.8306423425674438, "step": 192 }, { "epoch": 0.04, "learning_rate": 7.095588235294119e-06, "logits/chosen": -0.5545377135276794, "logits/rejected": -0.4981273412704468, "logps/chosen": -60.68083953857422, "logps/rejected": -76.26862335205078, "loss": 0.4971, "rewards/accuracies": 0.0, "rewards/chosen": 1.8801621198654175, "rewards/margins": -0.14160311222076416, "rewards/rejected": 2.0217652320861816, "step": 193 }, { "epoch": 0.04, "learning_rate": 7.132352941176472e-06, "logits/chosen": -0.9545303583145142, "logits/rejected": -0.9391946196556091, "logps/chosen": -100.79317474365234, "logps/rejected": -164.46884155273438, "loss": 0.308, "rewards/accuracies": 1.0, "rewards/chosen": 5.926705360412598, "rewards/margins": 0.21317672729492188, "rewards/rejected": 5.713528633117676, "step": 194 }, { "epoch": 0.04, "learning_rate": 7.169117647058825e-06, "logits/chosen": -0.6027175784111023, "logits/rejected": -0.4590074419975281, "logps/chosen": -237.7637939453125, "logps/rejected": -27.487380981445312, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 4.033807277679443, "rewards/margins": 3.645606756210327, "rewards/rejected": 0.388200581073761, "step": 195 }, { "epoch": 0.04, "learning_rate": 7.205882352941177e-06, "logits/chosen": -0.7247069478034973, "logits/rejected": -0.6928139925003052, "logps/chosen": -71.29470825195312, "logps/rejected": -96.02891540527344, "loss": 1.4495, "rewards/accuracies": 0.0, "rewards/chosen": 0.9844459891319275, "rewards/margins": -2.497202157974243, "rewards/rejected": 3.4816482067108154, "step": 196 }, { "epoch": 0.04, "learning_rate": 7.24264705882353e-06, "logits/chosen": -0.46613457798957825, "logits/rejected": -0.46613457798957825, "logps/chosen": -27.083398818969727, "logps/rejected": -27.083398818969727, "loss": 0.4884, "rewards/accuracies": 0.0, "rewards/chosen": 1.4107259511947632, "rewards/margins": 0.0, "rewards/rejected": 1.4107259511947632, "step": 197 }, { "epoch": 0.04, "learning_rate": 7.2794117647058826e-06, "logits/chosen": -0.477312833070755, "logits/rejected": -0.5218563079833984, "logps/chosen": -85.69023895263672, "logps/rejected": -181.49765014648438, "loss": 1.7755, "rewards/accuracies": 0.0, "rewards/chosen": 2.8891327381134033, "rewards/margins": -3.402029275894165, "rewards/rejected": 6.291162014007568, "step": 198 }, { "epoch": 0.04, "learning_rate": 7.3161764705882355e-06, "logits/chosen": -0.5996144413948059, "logits/rejected": -0.5215245485305786, "logps/chosen": -58.18327331542969, "logps/rejected": -98.82325744628906, "loss": 0.8313, "rewards/accuracies": 1.0, "rewards/chosen": 2.648258924484253, "rewards/margins": 0.5638601779937744, "rewards/rejected": 2.0843987464904785, "step": 199 }, { "epoch": 0.04, "learning_rate": 7.352941176470589e-06, "logits/chosen": -0.5003214478492737, "logits/rejected": -0.3825049102306366, "logps/chosen": -41.06975555419922, "logps/rejected": -73.66938781738281, "loss": 0.8315, "rewards/accuracies": 0.0, "rewards/chosen": 2.2828705310821533, "rewards/margins": -0.11379623413085938, "rewards/rejected": 2.3966667652130127, "step": 200 }, { "epoch": 0.04, "learning_rate": 7.389705882352942e-06, "logits/chosen": -0.5776367783546448, "logits/rejected": -0.6133081316947937, "logps/chosen": -93.88172149658203, "logps/rejected": -96.00908660888672, "loss": 0.9856, "rewards/accuracies": 0.0, "rewards/chosen": 3.096303701400757, "rewards/margins": -1.8101012706756592, "rewards/rejected": 4.906404972076416, "step": 201 }, { "epoch": 0.04, "learning_rate": 7.426470588235295e-06, "logits/chosen": -0.569211483001709, "logits/rejected": -0.4644588530063629, "logps/chosen": -149.50148010253906, "logps/rejected": -36.87789535522461, "loss": 0.4342, "rewards/accuracies": 1.0, "rewards/chosen": 6.318040370941162, "rewards/margins": 4.877601623535156, "rewards/rejected": 1.4404385089874268, "step": 202 }, { "epoch": 0.04, "learning_rate": 7.463235294117648e-06, "logits/chosen": -0.7246623635292053, "logits/rejected": -0.656237781047821, "logps/chosen": -89.5750961303711, "logps/rejected": -85.48779296875, "loss": 0.9269, "rewards/accuracies": 0.0, "rewards/chosen": 2.3357369899749756, "rewards/margins": -1.4893362522125244, "rewards/rejected": 3.8250732421875, "step": 203 }, { "epoch": 0.05, "learning_rate": 7.500000000000001e-06, "logits/chosen": -1.1219382286071777, "logits/rejected": -0.9967619776725769, "logps/chosen": -138.35311889648438, "logps/rejected": -189.76695251464844, "loss": 1.9129, "rewards/accuracies": 0.0, "rewards/chosen": 5.727813720703125, "rewards/margins": -1.1149826049804688, "rewards/rejected": 6.842796325683594, "step": 204 }, { "epoch": 0.05, "learning_rate": 7.536764705882353e-06, "logits/chosen": -0.7226523160934448, "logits/rejected": -0.6379236578941345, "logps/chosen": -38.52915573120117, "logps/rejected": -27.977645874023438, "loss": 0.5077, "rewards/accuracies": 1.0, "rewards/chosen": 1.5361751317977905, "rewards/margins": 0.9671676158905029, "rewards/rejected": 0.5690075159072876, "step": 205 }, { "epoch": 0.05, "learning_rate": 7.573529411764706e-06, "logits/chosen": -0.6800193786621094, "logits/rejected": -0.6487969756126404, "logps/chosen": -112.06816101074219, "logps/rejected": -87.93132019042969, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 6.116889953613281, "rewards/margins": 2.6134674549102783, "rewards/rejected": 3.503422498703003, "step": 206 }, { "epoch": 0.05, "learning_rate": 7.610294117647059e-06, "logits/chosen": -1.0503965616226196, "logits/rejected": -0.9439051151275635, "logps/chosen": -102.8238296508789, "logps/rejected": -95.32644653320312, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 6.402706146240234, "rewards/margins": 3.2564139366149902, "rewards/rejected": 3.146292209625244, "step": 207 }, { "epoch": 0.05, "learning_rate": 7.647058823529411e-06, "logits/chosen": -0.791691780090332, "logits/rejected": -0.7344197630882263, "logps/chosen": -83.49691772460938, "logps/rejected": -75.82505798339844, "loss": 0.2477, "rewards/accuracies": 1.0, "rewards/chosen": 2.8168365955352783, "rewards/margins": 1.3515061140060425, "rewards/rejected": 1.4653304815292358, "step": 208 }, { "epoch": 0.05, "learning_rate": 7.683823529411766e-06, "logits/chosen": -0.7496179342269897, "logits/rejected": -0.8015008568763733, "logps/chosen": -127.83084869384766, "logps/rejected": -171.04550170898438, "loss": 0.3433, "rewards/accuracies": 1.0, "rewards/chosen": 5.411880016326904, "rewards/margins": 0.030266761779785156, "rewards/rejected": 5.381613254547119, "step": 209 }, { "epoch": 0.05, "learning_rate": 7.720588235294119e-06, "logits/chosen": -0.9442256093025208, "logits/rejected": -0.7891682386398315, "logps/chosen": -83.98321533203125, "logps/rejected": -22.151203155517578, "loss": 0.1508, "rewards/accuracies": 1.0, "rewards/chosen": 3.6394660472869873, "rewards/margins": 1.6654009819030762, "rewards/rejected": 1.9740650653839111, "step": 210 }, { "epoch": 0.05, "learning_rate": 7.757352941176472e-06, "logits/chosen": -0.8124479651451111, "logits/rejected": -0.6147502660751343, "logps/chosen": -207.20974731445312, "logps/rejected": -104.32872009277344, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": 6.626623630523682, "rewards/margins": 2.3196306228637695, "rewards/rejected": 4.306993007659912, "step": 211 }, { "epoch": 0.05, "learning_rate": 7.794117647058825e-06, "logits/chosen": -0.5651516318321228, "logits/rejected": -0.29227563738822937, "logps/chosen": -120.72833251953125, "logps/rejected": -121.98793029785156, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 5.113641262054443, "rewards/margins": 2.8415281772613525, "rewards/rejected": 2.272113084793091, "step": 212 }, { "epoch": 0.05, "learning_rate": 7.830882352941177e-06, "logits/chosen": -1.114069938659668, "logits/rejected": -0.8591666221618652, "logps/chosen": -170.94708251953125, "logps/rejected": -73.8760986328125, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 6.687472820281982, "rewards/margins": 4.505950927734375, "rewards/rejected": 2.1815216541290283, "step": 213 }, { "epoch": 0.05, "learning_rate": 7.86764705882353e-06, "logits/chosen": -0.7961596846580505, "logits/rejected": -0.7072523236274719, "logps/chosen": -55.29295349121094, "logps/rejected": -63.60664749145508, "loss": 2.366, "rewards/accuracies": 1.0, "rewards/chosen": 2.4285476207733154, "rewards/margins": 0.49656784534454346, "rewards/rejected": 1.931979775428772, "step": 214 }, { "epoch": 0.05, "learning_rate": 7.904411764705883e-06, "logits/chosen": -0.7121235132217407, "logits/rejected": -0.5509044528007507, "logps/chosen": -97.12323760986328, "logps/rejected": -64.15950775146484, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 5.028411388397217, "rewards/margins": 3.2600245475769043, "rewards/rejected": 1.7683868408203125, "step": 215 }, { "epoch": 0.05, "learning_rate": 7.941176470588236e-06, "logits/chosen": -0.9993809461593628, "logits/rejected": -1.162668228149414, "logps/chosen": -87.00651550292969, "logps/rejected": -100.80354309082031, "loss": 0.6304, "rewards/accuracies": 0.0, "rewards/chosen": 2.2256622314453125, "rewards/margins": -0.2468094825744629, "rewards/rejected": 2.4724717140197754, "step": 216 }, { "epoch": 0.05, "learning_rate": 7.97794117647059e-06, "logits/chosen": -0.7537182569503784, "logits/rejected": -0.7226974368095398, "logps/chosen": -84.17732238769531, "logps/rejected": -46.939903259277344, "loss": 0.7747, "rewards/accuracies": 1.0, "rewards/chosen": 1.7231277227401733, "rewards/margins": 0.6090644598007202, "rewards/rejected": 1.1140632629394531, "step": 217 }, { "epoch": 0.05, "learning_rate": 8.014705882352942e-06, "logits/chosen": -0.5013160109519958, "logits/rejected": -0.4375461935997009, "logps/chosen": -59.288818359375, "logps/rejected": -58.9549446105957, "loss": 1.5702, "rewards/accuracies": 1.0, "rewards/chosen": 2.7824065685272217, "rewards/margins": 0.3976716995239258, "rewards/rejected": 2.384734869003296, "step": 218 }, { "epoch": 0.05, "learning_rate": 8.051470588235295e-06, "logits/chosen": -0.7022393345832825, "logits/rejected": -0.6457513570785522, "logps/chosen": -50.01953125, "logps/rejected": -84.47920989990234, "loss": 0.1824, "rewards/accuracies": 1.0, "rewards/chosen": 2.672720432281494, "rewards/margins": 0.8277565240859985, "rewards/rejected": 1.8449639081954956, "step": 219 }, { "epoch": 0.05, "learning_rate": 8.088235294117648e-06, "logits/chosen": -0.953034520149231, "logits/rejected": -0.8488571047782898, "logps/chosen": -93.47261047363281, "logps/rejected": -51.748748779296875, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": 5.065809726715088, "rewards/margins": 2.700451612472534, "rewards/rejected": 2.3653581142425537, "step": 220 }, { "epoch": 0.05, "learning_rate": 8.125000000000001e-06, "logits/chosen": -1.000974178314209, "logits/rejected": -0.8754360675811768, "logps/chosen": -92.73652648925781, "logps/rejected": -61.062950134277344, "loss": 0.2419, "rewards/accuracies": 1.0, "rewards/chosen": 4.100796699523926, "rewards/margins": 1.0343866348266602, "rewards/rejected": 3.0664100646972656, "step": 221 }, { "epoch": 0.05, "learning_rate": 8.161764705882354e-06, "logits/chosen": -0.6621544361114502, "logits/rejected": -0.6059238314628601, "logps/chosen": -44.34312057495117, "logps/rejected": -74.12811279296875, "loss": 0.4596, "rewards/accuracies": 0.0, "rewards/chosen": 2.2281880378723145, "rewards/margins": -0.36680102348327637, "rewards/rejected": 2.594989061355591, "step": 222 }, { "epoch": 0.05, "learning_rate": 8.198529411764707e-06, "logits/chosen": -0.6752831339836121, "logits/rejected": -0.6010421514511108, "logps/chosen": -102.87511444091797, "logps/rejected": -62.10005187988281, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": 3.6822030544281006, "rewards/margins": 1.0918121337890625, "rewards/rejected": 2.590390920639038, "step": 223 }, { "epoch": 0.05, "learning_rate": 8.23529411764706e-06, "logits/chosen": -1.0519440174102783, "logits/rejected": -0.8023979067802429, "logps/chosen": -186.46444702148438, "logps/rejected": -49.4288215637207, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 5.504815578460693, "rewards/margins": 3.239433526992798, "rewards/rejected": 2.2653820514678955, "step": 224 }, { "epoch": 0.05, "learning_rate": 8.272058823529413e-06, "logits/chosen": -1.1487562656402588, "logits/rejected": -0.914289653301239, "logps/chosen": -155.05044555664062, "logps/rejected": -94.04718017578125, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 7.4215240478515625, "rewards/margins": 5.812375068664551, "rewards/rejected": 1.6091492176055908, "step": 225 }, { "epoch": 0.05, "learning_rate": 8.308823529411766e-06, "logits/chosen": -1.0680925846099854, "logits/rejected": -0.9482945203781128, "logps/chosen": -46.24665832519531, "logps/rejected": -58.37206268310547, "loss": 0.3059, "rewards/accuracies": 1.0, "rewards/chosen": 2.553478956222534, "rewards/margins": 0.2091071605682373, "rewards/rejected": 2.344371795654297, "step": 226 }, { "epoch": 0.05, "learning_rate": 8.345588235294119e-06, "logits/chosen": -0.9122298955917358, "logits/rejected": -0.7410547137260437, "logps/chosen": -73.86959838867188, "logps/rejected": -17.736059188842773, "loss": 0.5518, "rewards/accuracies": 1.0, "rewards/chosen": 2.9474685192108154, "rewards/margins": 2.0929391384124756, "rewards/rejected": 0.8545293807983398, "step": 227 }, { "epoch": 0.05, "learning_rate": 8.382352941176472e-06, "logits/chosen": -1.1992560625076294, "logits/rejected": -0.9078497290611267, "logps/chosen": -132.79757690429688, "logps/rejected": -65.16104888916016, "loss": 0.5558, "rewards/accuracies": 1.0, "rewards/chosen": 4.172430515289307, "rewards/margins": 2.4708092212677, "rewards/rejected": 1.7016212940216064, "step": 228 }, { "epoch": 0.05, "learning_rate": 8.419117647058824e-06, "logits/chosen": -1.3818943500518799, "logits/rejected": -1.3500416278839111, "logps/chosen": -67.42118835449219, "logps/rejected": -51.349395751953125, "loss": 0.5003, "rewards/accuracies": 0.0, "rewards/chosen": 3.1135787963867188, "rewards/margins": -0.07281279563903809, "rewards/rejected": 3.186391592025757, "step": 229 }, { "epoch": 0.05, "learning_rate": 8.455882352941177e-06, "logits/chosen": -0.8136378526687622, "logits/rejected": -0.6033999919891357, "logps/chosen": -69.67890167236328, "logps/rejected": -11.027908325195312, "loss": 0.5743, "rewards/accuracies": 1.0, "rewards/chosen": 2.3514626026153564, "rewards/margins": 1.9865541458129883, "rewards/rejected": 0.36490851640701294, "step": 230 }, { "epoch": 0.05, "learning_rate": 8.49264705882353e-06, "logits/chosen": -0.6408169269561768, "logits/rejected": -0.6289994120597839, "logps/chosen": -72.32559204101562, "logps/rejected": -114.7309341430664, "loss": 0.5627, "rewards/accuracies": 0.0, "rewards/chosen": 3.3110756874084473, "rewards/margins": -0.18947291374206543, "rewards/rejected": 3.5005486011505127, "step": 231 }, { "epoch": 0.05, "learning_rate": 8.529411764705883e-06, "logits/chosen": -0.5816882848739624, "logits/rejected": -0.489219605922699, "logps/chosen": -78.75807189941406, "logps/rejected": -77.17181396484375, "loss": 0.3759, "rewards/accuracies": 1.0, "rewards/chosen": 2.2827980518341064, "rewards/margins": 0.8643302917480469, "rewards/rejected": 1.4184677600860596, "step": 232 }, { "epoch": 0.05, "learning_rate": 8.566176470588236e-06, "logits/chosen": -0.6478106379508972, "logits/rejected": -0.6844859719276428, "logps/chosen": -78.23534393310547, "logps/rejected": -115.42715454101562, "loss": 1.1091, "rewards/accuracies": 0.0, "rewards/chosen": 2.200188398361206, "rewards/margins": -0.3726844787597656, "rewards/rejected": 2.5728728771209717, "step": 233 }, { "epoch": 0.05, "learning_rate": 8.60294117647059e-06, "logits/chosen": -0.5519433617591858, "logits/rejected": -0.6195653080940247, "logps/chosen": -66.20501708984375, "logps/rejected": -93.34510803222656, "loss": 1.9388, "rewards/accuracies": 0.0, "rewards/chosen": 3.878840684890747, "rewards/margins": -2.8879106044769287, "rewards/rejected": 6.766751289367676, "step": 234 }, { "epoch": 0.05, "learning_rate": 8.639705882352942e-06, "logits/chosen": -0.753578245639801, "logits/rejected": -0.6718990206718445, "logps/chosen": -50.182098388671875, "logps/rejected": -75.57794189453125, "loss": 1.5043, "rewards/accuracies": 0.0, "rewards/chosen": 1.7610901594161987, "rewards/margins": -2.8986268043518066, "rewards/rejected": 4.659717082977295, "step": 235 }, { "epoch": 0.05, "learning_rate": 8.676470588235295e-06, "logits/chosen": -0.9243901968002319, "logits/rejected": -0.7536441683769226, "logps/chosen": -147.01605224609375, "logps/rejected": -62.224491119384766, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 3.151986837387085, "rewards/margins": 0.5609912872314453, "rewards/rejected": 2.5909955501556396, "step": 236 }, { "epoch": 0.05, "learning_rate": 8.713235294117648e-06, "logits/chosen": -0.7019226551055908, "logits/rejected": -0.7019226551055908, "logps/chosen": -54.31983184814453, "logps/rejected": -54.31983184814453, "loss": 0.6544, "rewards/accuracies": 0.0, "rewards/chosen": 2.790786027908325, "rewards/margins": 0.0, "rewards/rejected": 2.790786027908325, "step": 237 }, { "epoch": 0.05, "learning_rate": 8.750000000000001e-06, "logits/chosen": -0.9066272974014282, "logits/rejected": -0.8462355136871338, "logps/chosen": -48.512603759765625, "logps/rejected": -48.81183624267578, "loss": 0.2573, "rewards/accuracies": 1.0, "rewards/chosen": 2.6154251098632812, "rewards/margins": 0.44126343727111816, "rewards/rejected": 2.174161672592163, "step": 238 }, { "epoch": 0.05, "learning_rate": 8.786764705882354e-06, "logits/chosen": -0.526681125164032, "logits/rejected": -0.526681125164032, "logps/chosen": -58.115753173828125, "logps/rejected": -58.115753173828125, "loss": 0.3501, "rewards/accuracies": 0.0, "rewards/chosen": 1.024250864982605, "rewards/margins": 0.0, "rewards/rejected": 1.024250864982605, "step": 239 }, { "epoch": 0.05, "learning_rate": 8.823529411764707e-06, "logits/chosen": -0.7861285209655762, "logits/rejected": -0.7312017679214478, "logps/chosen": -93.71565246582031, "logps/rejected": -144.8107452392578, "loss": 1.6631, "rewards/accuracies": 0.0, "rewards/chosen": 4.2075700759887695, "rewards/margins": -1.9229493141174316, "rewards/rejected": 6.130519390106201, "step": 240 }, { "epoch": 0.05, "learning_rate": 8.86029411764706e-06, "logits/chosen": -0.5977934002876282, "logits/rejected": -0.6127896308898926, "logps/chosen": -44.35197448730469, "logps/rejected": -62.456844329833984, "loss": 0.8341, "rewards/accuracies": 0.0, "rewards/chosen": 2.412360429763794, "rewards/margins": -1.1211254596710205, "rewards/rejected": 3.5334858894348145, "step": 241 }, { "epoch": 0.05, "learning_rate": 8.897058823529413e-06, "logits/chosen": -1.1733819246292114, "logits/rejected": -1.132069706916809, "logps/chosen": -62.62742614746094, "logps/rejected": -78.35459899902344, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": 4.166404724121094, "rewards/margins": 2.1827406883239746, "rewards/rejected": 1.9836639165878296, "step": 242 }, { "epoch": 0.05, "learning_rate": 8.933823529411766e-06, "logits/chosen": -0.46218636631965637, "logits/rejected": -0.15886935591697693, "logps/chosen": -62.49578857421875, "logps/rejected": -45.30828094482422, "loss": 0.164, "rewards/accuracies": 1.0, "rewards/chosen": 1.9169037342071533, "rewards/margins": 0.9517200589179993, "rewards/rejected": 0.965183675289154, "step": 243 }, { "epoch": 0.05, "learning_rate": 8.970588235294119e-06, "logits/chosen": -0.7473495006561279, "logits/rejected": -0.8791915774345398, "logps/chosen": -78.71994018554688, "logps/rejected": -60.55509948730469, "loss": 0.2241, "rewards/accuracies": 1.0, "rewards/chosen": 2.9746367931365967, "rewards/margins": 0.6057281494140625, "rewards/rejected": 2.368908643722534, "step": 244 }, { "epoch": 0.05, "learning_rate": 9.007352941176471e-06, "logits/chosen": -0.39131447672843933, "logits/rejected": -0.2796817421913147, "logps/chosen": -33.850868225097656, "logps/rejected": -46.126609802246094, "loss": 0.7211, "rewards/accuracies": 0.0, "rewards/chosen": 1.2527207136154175, "rewards/margins": -0.8591865301132202, "rewards/rejected": 2.1119072437286377, "step": 245 }, { "epoch": 0.05, "learning_rate": 9.044117647058824e-06, "logits/chosen": -0.9979109168052673, "logits/rejected": -0.8139781951904297, "logps/chosen": -141.60562133789062, "logps/rejected": -114.6822509765625, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 8.089903831481934, "rewards/margins": 3.317209243774414, "rewards/rejected": 4.7726945877075195, "step": 246 }, { "epoch": 0.05, "learning_rate": 9.080882352941177e-06, "logits/chosen": -0.4989129900932312, "logits/rejected": -0.42044225335121155, "logps/chosen": -47.44873809814453, "logps/rejected": -44.295597076416016, "loss": 2.3478, "rewards/accuracies": 0.0, "rewards/chosen": 1.2759650945663452, "rewards/margins": -1.7250522375106812, "rewards/rejected": 3.0010173320770264, "step": 247 }, { "epoch": 0.05, "learning_rate": 9.11764705882353e-06, "logits/chosen": -1.210329294204712, "logits/rejected": -1.0516067743301392, "logps/chosen": -76.35741424560547, "logps/rejected": -90.77245330810547, "loss": 1.9924, "rewards/accuracies": 0.0, "rewards/chosen": 2.235218048095703, "rewards/margins": -1.8704195022583008, "rewards/rejected": 4.105637550354004, "step": 248 }, { "epoch": 0.06, "learning_rate": 9.154411764705883e-06, "logits/chosen": -0.8709240555763245, "logits/rejected": -0.9794121980667114, "logps/chosen": -45.36254119873047, "logps/rejected": -81.07281494140625, "loss": 2.1961, "rewards/accuracies": 0.0, "rewards/chosen": 1.6539623737335205, "rewards/margins": -3.7581746578216553, "rewards/rejected": 5.412137031555176, "step": 249 }, { "epoch": 0.06, "learning_rate": 9.191176470588236e-06, "logits/chosen": -0.821828305721283, "logits/rejected": -0.74835205078125, "logps/chosen": -52.42012023925781, "logps/rejected": -25.670791625976562, "loss": 0.1147, "rewards/accuracies": 1.0, "rewards/chosen": 2.541461944580078, "rewards/margins": 1.9400895833969116, "rewards/rejected": 0.6013723611831665, "step": 250 }, { "epoch": 0.06, "learning_rate": 9.227941176470589e-06, "logits/chosen": -0.9191968441009521, "logits/rejected": -0.9218922853469849, "logps/chosen": -103.13037109375, "logps/rejected": -46.081939697265625, "loss": 0.7799, "rewards/accuracies": 1.0, "rewards/chosen": 3.2523727416992188, "rewards/margins": 0.700347900390625, "rewards/rejected": 2.5520248413085938, "step": 251 }, { "epoch": 0.06, "learning_rate": 9.264705882352942e-06, "logits/chosen": -0.7156630158424377, "logits/rejected": -0.7898976802825928, "logps/chosen": -51.588661193847656, "logps/rejected": -118.47432708740234, "loss": 0.6883, "rewards/accuracies": 0.0, "rewards/chosen": 2.5362441539764404, "rewards/margins": -1.0470209121704102, "rewards/rejected": 3.5832650661468506, "step": 252 }, { "epoch": 0.06, "learning_rate": 9.301470588235295e-06, "logits/chosen": -0.7216135859489441, "logits/rejected": -0.6134456396102905, "logps/chosen": -95.27656555175781, "logps/rejected": -60.35407257080078, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": 2.0875213146209717, "rewards/margins": 1.09748375415802, "rewards/rejected": 0.9900375604629517, "step": 253 }, { "epoch": 0.06, "learning_rate": 9.338235294117648e-06, "logits/chosen": -0.5660054683685303, "logits/rejected": -0.5780116319656372, "logps/chosen": -80.8218002319336, "logps/rejected": -134.12045288085938, "loss": 0.5336, "rewards/accuracies": 0.0, "rewards/chosen": 3.9942307472229004, "rewards/margins": -0.6052641868591309, "rewards/rejected": 4.599494934082031, "step": 254 }, { "epoch": 0.06, "learning_rate": 9.375000000000001e-06, "logits/chosen": -1.0062286853790283, "logits/rejected": -0.808822512626648, "logps/chosen": -95.43157958984375, "logps/rejected": -96.12702178955078, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 4.870428562164307, "rewards/margins": 2.5526559352874756, "rewards/rejected": 2.317772626876831, "step": 255 }, { "epoch": 0.06, "learning_rate": 9.411764705882354e-06, "logits/chosen": -0.878109872341156, "logits/rejected": -1.3638759851455688, "logps/chosen": -65.50064086914062, "logps/rejected": -32.60120391845703, "loss": 0.1246, "rewards/accuracies": 1.0, "rewards/chosen": 2.1796586513519287, "rewards/margins": 1.503505825996399, "rewards/rejected": 0.6761528253555298, "step": 256 }, { "epoch": 0.06, "learning_rate": 9.448529411764707e-06, "logits/chosen": -0.8270859718322754, "logits/rejected": -0.7534613013267517, "logps/chosen": -62.84870147705078, "logps/rejected": -190.1297149658203, "loss": 1.9791, "rewards/accuracies": 0.0, "rewards/chosen": 1.6355575323104858, "rewards/margins": -3.917384147644043, "rewards/rejected": 5.552941799163818, "step": 257 }, { "epoch": 0.06, "learning_rate": 9.48529411764706e-06, "logits/chosen": -0.9635130167007446, "logits/rejected": -0.9635130167007446, "logps/chosen": -66.73211669921875, "logps/rejected": -66.73211669921875, "loss": 1.0212, "rewards/accuracies": 0.0, "rewards/chosen": 1.6443299055099487, "rewards/margins": 0.0, "rewards/rejected": 1.6443299055099487, "step": 258 }, { "epoch": 0.06, "learning_rate": 9.522058823529413e-06, "logits/chosen": -1.180136799812317, "logits/rejected": -1.0857630968093872, "logps/chosen": -115.04624938964844, "logps/rejected": -184.23489379882812, "loss": 0.8535, "rewards/accuracies": 0.0, "rewards/chosen": 4.3647966384887695, "rewards/margins": -1.4694929122924805, "rewards/rejected": 5.83428955078125, "step": 259 }, { "epoch": 0.06, "learning_rate": 9.558823529411766e-06, "logits/chosen": -0.5407373905181885, "logits/rejected": -0.2552682161331177, "logps/chosen": -95.50241088867188, "logps/rejected": -28.766923904418945, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 3.6993470191955566, "rewards/margins": 3.3416357040405273, "rewards/rejected": 0.35771122574806213, "step": 260 }, { "epoch": 0.06, "learning_rate": 9.595588235294119e-06, "logits/chosen": -0.8125731348991394, "logits/rejected": -0.8125731348991394, "logps/chosen": -83.03770446777344, "logps/rejected": -83.03770446777344, "loss": 0.3959, "rewards/accuracies": 0.0, "rewards/chosen": 2.0657334327697754, "rewards/margins": 0.0, "rewards/rejected": 2.0657334327697754, "step": 261 }, { "epoch": 0.06, "learning_rate": 9.632352941176471e-06, "logits/chosen": -0.556330680847168, "logits/rejected": -0.5119286179542542, "logps/chosen": -53.523799896240234, "logps/rejected": -55.58931350708008, "loss": 2.6094, "rewards/accuracies": 1.0, "rewards/chosen": 1.6518970727920532, "rewards/margins": 0.2933257818222046, "rewards/rejected": 1.3585712909698486, "step": 262 }, { "epoch": 0.06, "learning_rate": 9.669117647058824e-06, "logits/chosen": -0.4241598844528198, "logits/rejected": -0.35758641362190247, "logps/chosen": -24.631649017333984, "logps/rejected": -4.1829729080200195, "loss": 0.3866, "rewards/accuracies": 1.0, "rewards/chosen": 0.8213794827461243, "rewards/margins": 0.3587431013584137, "rewards/rejected": 0.46263638138771057, "step": 263 }, { "epoch": 0.06, "learning_rate": 9.705882352941177e-06, "logits/chosen": -1.046912670135498, "logits/rejected": -1.0311081409454346, "logps/chosen": -76.74494934082031, "logps/rejected": -81.18445587158203, "loss": 1.03, "rewards/accuracies": 0.0, "rewards/chosen": 1.38524329662323, "rewards/margins": -1.712602972984314, "rewards/rejected": 3.097846269607544, "step": 264 }, { "epoch": 0.06, "learning_rate": 9.74264705882353e-06, "logits/chosen": -0.9786429405212402, "logits/rejected": -0.7934418320655823, "logps/chosen": -111.20184326171875, "logps/rejected": -84.65701293945312, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 6.794134616851807, "rewards/margins": 3.1000709533691406, "rewards/rejected": 3.694063663482666, "step": 265 }, { "epoch": 0.06, "learning_rate": 9.779411764705883e-06, "logits/chosen": -0.8527916669845581, "logits/rejected": -0.5497974157333374, "logps/chosen": -147.4515838623047, "logps/rejected": -54.892303466796875, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 5.454272747039795, "rewards/margins": 2.84336256980896, "rewards/rejected": 2.610910177230835, "step": 266 }, { "epoch": 0.06, "learning_rate": 9.816176470588236e-06, "logits/chosen": -0.7159310579299927, "logits/rejected": -0.5447282195091248, "logps/chosen": -48.71993637084961, "logps/rejected": -49.419837951660156, "loss": 1.5367, "rewards/accuracies": 0.0, "rewards/chosen": 1.2656391859054565, "rewards/margins": -2.3578786849975586, "rewards/rejected": 3.6235177516937256, "step": 267 }, { "epoch": 0.06, "learning_rate": 9.852941176470589e-06, "logits/chosen": -0.7789899110794067, "logits/rejected": -0.5129631161689758, "logps/chosen": -110.22201538085938, "logps/rejected": -29.113731384277344, "loss": 0.5115, "rewards/accuracies": 1.0, "rewards/chosen": 5.611711025238037, "rewards/margins": 5.036139011383057, "rewards/rejected": 0.5755718350410461, "step": 268 }, { "epoch": 0.06, "learning_rate": 9.889705882352942e-06, "logits/chosen": -0.6933237910270691, "logits/rejected": -0.874915361404419, "logps/chosen": -27.746185302734375, "logps/rejected": -53.53874206542969, "loss": 1.852, "rewards/accuracies": 0.0, "rewards/chosen": 1.0851821899414062, "rewards/margins": -2.0690269470214844, "rewards/rejected": 3.1542091369628906, "step": 269 }, { "epoch": 0.06, "learning_rate": 9.926470588235295e-06, "logits/chosen": -1.1539803743362427, "logits/rejected": -1.0242854356765747, "logps/chosen": -61.50498962402344, "logps/rejected": -55.570220947265625, "loss": 0.3107, "rewards/accuracies": 1.0, "rewards/chosen": 1.1735687255859375, "rewards/margins": 0.2471511960029602, "rewards/rejected": 0.9264175295829773, "step": 270 }, { "epoch": 0.06, "learning_rate": 9.963235294117648e-06, "logits/chosen": -1.134114384651184, "logits/rejected": -0.8443987369537354, "logps/chosen": -164.55650329589844, "logps/rejected": -89.78739166259766, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 4.316183567047119, "rewards/margins": 2.3657312393188477, "rewards/rejected": 1.950452446937561, "step": 271 }, { "epoch": 0.06, "learning_rate": 1e-05, "logits/chosen": -1.0309709310531616, "logits/rejected": -0.8994536995887756, "logps/chosen": -131.50765991210938, "logps/rejected": -37.190277099609375, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 5.684930324554443, "rewards/margins": 3.399186611175537, "rewards/rejected": 2.2857437133789062, "step": 272 }, { "epoch": 0.06, "learning_rate": 9.99999967875601e-06, "logits/chosen": -0.6259334087371826, "logits/rejected": -0.5051776170730591, "logps/chosen": -65.63638305664062, "logps/rejected": -10.149474143981934, "loss": 0.4043, "rewards/accuracies": 1.0, "rewards/chosen": 2.3792855739593506, "rewards/margins": 1.2259796857833862, "rewards/rejected": 1.1533058881759644, "step": 273 }, { "epoch": 0.06, "learning_rate": 9.999998715024082e-06, "logits/chosen": -1.2321844100952148, "logits/rejected": -1.1745872497558594, "logps/chosen": -75.29187774658203, "logps/rejected": -81.86738586425781, "loss": 0.5345, "rewards/accuracies": 0.0, "rewards/chosen": 2.2691657543182373, "rewards/margins": -0.31418609619140625, "rewards/rejected": 2.5833518505096436, "step": 274 }, { "epoch": 0.06, "learning_rate": 9.999997108804337e-06, "logits/chosen": -0.6665979623794556, "logits/rejected": -0.6665979623794556, "logps/chosen": -75.84907531738281, "logps/rejected": -75.84907531738281, "loss": 0.5624, "rewards/accuracies": 0.0, "rewards/chosen": 2.015068769454956, "rewards/margins": 0.0, "rewards/rejected": 2.015068769454956, "step": 275 }, { "epoch": 0.06, "learning_rate": 9.999994860096985e-06, "logits/chosen": -0.9268151521682739, "logits/rejected": -0.8566977381706238, "logps/chosen": -44.41648864746094, "logps/rejected": -71.40478515625, "loss": 0.2554, "rewards/accuracies": 1.0, "rewards/chosen": 3.4503097534179688, "rewards/margins": 0.4923841953277588, "rewards/rejected": 2.95792555809021, "step": 276 }, { "epoch": 0.06, "learning_rate": 9.99999196890231e-06, "logits/chosen": -1.0234547853469849, "logits/rejected": -0.7519076466560364, "logps/chosen": -244.68829345703125, "logps/rejected": -24.148109436035156, "loss": 0.3679, "rewards/accuracies": 1.0, "rewards/chosen": 6.266720771789551, "rewards/margins": 5.613376617431641, "rewards/rejected": 0.6533443331718445, "step": 277 }, { "epoch": 0.06, "learning_rate": 9.999988435220688e-06, "logits/chosen": -0.5601076483726501, "logits/rejected": -0.29593586921691895, "logps/chosen": -58.14442825317383, "logps/rejected": -64.28379821777344, "loss": 0.1051, "rewards/accuracies": 1.0, "rewards/chosen": 5.256274700164795, "rewards/margins": 2.567936897277832, "rewards/rejected": 2.688337802886963, "step": 278 }, { "epoch": 0.06, "learning_rate": 9.999984259052573e-06, "logits/chosen": -1.131658911705017, "logits/rejected": -1.0748705863952637, "logps/chosen": -48.48279571533203, "logps/rejected": -38.90165710449219, "loss": 0.7449, "rewards/accuracies": 0.0, "rewards/chosen": 1.777910590171814, "rewards/margins": -1.1621063947677612, "rewards/rejected": 2.940016984939575, "step": 279 }, { "epoch": 0.06, "learning_rate": 9.9999794403985e-06, "logits/chosen": -1.0920095443725586, "logits/rejected": -1.1040852069854736, "logps/chosen": -53.831912994384766, "logps/rejected": -68.69371032714844, "loss": 0.4276, "rewards/accuracies": 0.0, "rewards/chosen": 1.729735255241394, "rewards/margins": -0.2994624376296997, "rewards/rejected": 2.0291976928710938, "step": 280 }, { "epoch": 0.06, "learning_rate": 9.999973979259088e-06, "logits/chosen": -0.9833388328552246, "logits/rejected": -0.9158704876899719, "logps/chosen": -45.85148620605469, "logps/rejected": -43.9983024597168, "loss": 1.057, "rewards/accuracies": 1.0, "rewards/chosen": 2.1485657691955566, "rewards/margins": 0.38286292552948, "rewards/rejected": 1.7657028436660767, "step": 281 }, { "epoch": 0.06, "learning_rate": 9.99996787563504e-06, "logits/chosen": -0.8753253817558289, "logits/rejected": -0.7754197716712952, "logps/chosen": -156.3040771484375, "logps/rejected": -48.29625701904297, "loss": 0.2135, "rewards/accuracies": 1.0, "rewards/chosen": 4.20346212387085, "rewards/margins": 0.9642095565795898, "rewards/rejected": 3.2392525672912598, "step": 282 }, { "epoch": 0.06, "learning_rate": 9.999961129527139e-06, "logits/chosen": -0.7374592423439026, "logits/rejected": -0.6877994537353516, "logps/chosen": -57.447322845458984, "logps/rejected": -57.13850402832031, "loss": 0.3936, "rewards/accuracies": 0.0, "rewards/chosen": 1.577768325805664, "rewards/margins": -0.09278225898742676, "rewards/rejected": 1.6705505847930908, "step": 283 }, { "epoch": 0.06, "learning_rate": 9.999953740936252e-06, "logits/chosen": -0.5903159976005554, "logits/rejected": -0.5326692461967468, "logps/chosen": -69.83118438720703, "logps/rejected": -69.9915542602539, "loss": 1.1012, "rewards/accuracies": 0.0, "rewards/chosen": 0.9569229483604431, "rewards/margins": -2.0810317993164062, "rewards/rejected": 3.037954807281494, "step": 284 }, { "epoch": 0.06, "learning_rate": 9.99994570986333e-06, "logits/chosen": -0.7927528619766235, "logits/rejected": -0.7604236006736755, "logps/chosen": -107.29705047607422, "logps/rejected": -159.47723388671875, "loss": 0.8877, "rewards/accuracies": 0.0, "rewards/chosen": 5.932048797607422, "rewards/margins": -1.5649909973144531, "rewards/rejected": 7.497039794921875, "step": 285 }, { "epoch": 0.06, "learning_rate": 9.999937036309402e-06, "logits/chosen": -1.2237523794174194, "logits/rejected": -1.2345725297927856, "logps/chosen": -59.52238464355469, "logps/rejected": -93.23530578613281, "loss": 3.0136, "rewards/accuracies": 0.0, "rewards/chosen": 2.963510274887085, "rewards/margins": -4.71904182434082, "rewards/rejected": 7.682551860809326, "step": 286 }, { "epoch": 0.06, "learning_rate": 9.999927720275586e-06, "logits/chosen": -1.040673851966858, "logits/rejected": -0.9863672256469727, "logps/chosen": -202.3381805419922, "logps/rejected": -78.49738311767578, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 7.362806797027588, "rewards/margins": 4.335740089416504, "rewards/rejected": 3.027066946029663, "step": 287 }, { "epoch": 0.06, "learning_rate": 9.999917761763076e-06, "logits/chosen": -0.6435728669166565, "logits/rejected": -0.38236987590789795, "logps/chosen": -65.68670654296875, "logps/rejected": -15.011085510253906, "loss": 0.2413, "rewards/accuracies": 1.0, "rewards/chosen": 1.8087371587753296, "rewards/margins": 1.5252854824066162, "rewards/rejected": 0.283451646566391, "step": 288 }, { "epoch": 0.06, "learning_rate": 9.999907160773155e-06, "logits/chosen": -0.31398800015449524, "logits/rejected": -0.3692780137062073, "logps/chosen": -60.51097106933594, "logps/rejected": -76.23136901855469, "loss": 1.3014, "rewards/accuracies": 0.0, "rewards/chosen": 2.0987396240234375, "rewards/margins": -1.4635238647460938, "rewards/rejected": 3.5622634887695312, "step": 289 }, { "epoch": 0.06, "learning_rate": 9.99989591730718e-06, "logits/chosen": -0.7584868669509888, "logits/rejected": -0.6385793089866638, "logps/chosen": -133.29034423828125, "logps/rejected": -113.73640441894531, "loss": 0.4327, "rewards/accuracies": 0.0, "rewards/chosen": 4.406353950500488, "rewards/margins": -0.27542543411254883, "rewards/rejected": 4.681779384613037, "step": 290 }, { "epoch": 0.06, "learning_rate": 9.999884031366603e-06, "logits/chosen": -0.9668216705322266, "logits/rejected": -0.8920188546180725, "logps/chosen": -63.76808166503906, "logps/rejected": -81.00260925292969, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 3.149919271469116, "rewards/margins": 2.4196908473968506, "rewards/rejected": 0.7302284240722656, "step": 291 }, { "epoch": 0.06, "learning_rate": 9.999871502952944e-06, "logits/chosen": -0.8904373049736023, "logits/rejected": -0.6065946817398071, "logps/chosen": -131.74899291992188, "logps/rejected": -19.149871826171875, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 4.471490383148193, "rewards/margins": 3.959451198577881, "rewards/rejected": 0.5120391845703125, "step": 292 }, { "epoch": 0.06, "learning_rate": 9.99985833206782e-06, "logits/chosen": -0.8363608121871948, "logits/rejected": -0.7196695804595947, "logps/chosen": -53.42871856689453, "logps/rejected": -54.493247985839844, "loss": 0.4249, "rewards/accuracies": 1.0, "rewards/chosen": 1.951680064201355, "rewards/margins": 1.4112129211425781, "rewards/rejected": 0.5404670834541321, "step": 293 }, { "epoch": 0.07, "learning_rate": 9.999844518712917e-06, "logits/chosen": -0.6037253141403198, "logits/rejected": -0.6037253141403198, "logps/chosen": -82.15094757080078, "logps/rejected": -82.15094757080078, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 1.8490288257598877, "rewards/margins": 0.0, "rewards/rejected": 1.8490288257598877, "step": 294 }, { "epoch": 0.07, "learning_rate": 9.999830062890012e-06, "logits/chosen": -0.5899395942687988, "logits/rejected": -0.5701883435249329, "logps/chosen": -70.64962768554688, "logps/rejected": -41.09034729003906, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 1.4918739795684814, "rewards/margins": -1.0836799144744873, "rewards/rejected": 2.5755538940429688, "step": 295 }, { "epoch": 0.07, "learning_rate": 9.999814964600965e-06, "logits/chosen": -0.7303915619850159, "logits/rejected": -0.7028882503509521, "logps/chosen": -54.383384704589844, "logps/rejected": -62.85866928100586, "loss": 0.3251, "rewards/accuracies": 1.0, "rewards/chosen": 2.958664655685425, "rewards/margins": 0.4546937942504883, "rewards/rejected": 2.5039708614349365, "step": 296 }, { "epoch": 0.07, "learning_rate": 9.999799223847714e-06, "logits/chosen": -0.8812035322189331, "logits/rejected": -0.8338280320167542, "logps/chosen": -65.61961364746094, "logps/rejected": -56.964656829833984, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 3.3760910034179688, "rewards/margins": 0.8898189067840576, "rewards/rejected": 2.486272096633911, "step": 297 }, { "epoch": 0.07, "learning_rate": 9.999782840632281e-06, "logits/chosen": -0.9958134889602661, "logits/rejected": -0.8630585074424744, "logps/chosen": -49.16426086425781, "logps/rejected": -26.644044876098633, "loss": 0.8814, "rewards/accuracies": 1.0, "rewards/chosen": 2.5399169921875, "rewards/margins": 2.2039592266082764, "rewards/rejected": 0.33595773577690125, "step": 298 }, { "epoch": 0.07, "learning_rate": 9.999765814956771e-06, "logits/chosen": -0.7856517434120178, "logits/rejected": -0.7272389531135559, "logps/chosen": -49.285762786865234, "logps/rejected": -70.93008422851562, "loss": 1.1502, "rewards/accuracies": 1.0, "rewards/chosen": 2.4911084175109863, "rewards/margins": 0.5816754102706909, "rewards/rejected": 1.9094330072402954, "step": 299 }, { "epoch": 0.07, "learning_rate": 9.999748146823376e-06, "logits/chosen": -0.7371383905410767, "logits/rejected": -0.6998206377029419, "logps/chosen": -110.85060119628906, "logps/rejected": -54.60508728027344, "loss": 0.3427, "rewards/accuracies": 1.0, "rewards/chosen": 3.785060167312622, "rewards/margins": 0.30803370475769043, "rewards/rejected": 3.4770264625549316, "step": 300 }, { "epoch": 0.07, "learning_rate": 9.999729836234363e-06, "logits/chosen": -1.07172691822052, "logits/rejected": -1.053321123123169, "logps/chosen": -56.79887008666992, "logps/rejected": -42.09619903564453, "loss": 0.7324, "rewards/accuracies": 0.0, "rewards/chosen": 1.3651478290557861, "rewards/margins": -1.1643757820129395, "rewards/rejected": 2.5295236110687256, "step": 301 }, { "epoch": 0.07, "learning_rate": 9.999710883192082e-06, "logits/chosen": -1.20352041721344, "logits/rejected": -1.1025315523147583, "logps/chosen": -55.35881423950195, "logps/rejected": -64.18508911132812, "loss": 1.6718, "rewards/accuracies": 1.0, "rewards/chosen": 3.3988964557647705, "rewards/margins": 0.4835948944091797, "rewards/rejected": 2.915301561355591, "step": 302 }, { "epoch": 0.07, "learning_rate": 9.999691287698975e-06, "logits/chosen": -0.9233021140098572, "logits/rejected": -0.8284326195716858, "logps/chosen": -69.83164978027344, "logps/rejected": -53.10004425048828, "loss": 0.2374, "rewards/accuracies": 1.0, "rewards/chosen": 3.725959062576294, "rewards/margins": 1.4043769836425781, "rewards/rejected": 2.321582078933716, "step": 303 }, { "epoch": 0.07, "learning_rate": 9.999671049757554e-06, "logits/chosen": -1.0810595750808716, "logits/rejected": -0.8753198981285095, "logps/chosen": -93.20897674560547, "logps/rejected": -30.710540771484375, "loss": 1.7667, "rewards/accuracies": 1.0, "rewards/chosen": 3.544396162033081, "rewards/margins": 3.3930068016052246, "rewards/rejected": 0.15138931572437286, "step": 304 }, { "epoch": 0.07, "learning_rate": 9.999650169370423e-06, "logits/chosen": -0.8862051963806152, "logits/rejected": -0.8488323092460632, "logps/chosen": -92.28251647949219, "logps/rejected": -147.635986328125, "loss": 0.7539, "rewards/accuracies": 0.0, "rewards/chosen": 3.98130202293396, "rewards/margins": -1.2544310092926025, "rewards/rejected": 5.2357330322265625, "step": 305 }, { "epoch": 0.07, "learning_rate": 9.999628646540262e-06, "logits/chosen": -1.1172723770141602, "logits/rejected": -1.1107345819473267, "logps/chosen": -58.59980010986328, "logps/rejected": -42.379486083984375, "loss": 1.0619, "rewards/accuracies": 0.0, "rewards/chosen": 2.407701253890991, "rewards/margins": -0.49434566497802734, "rewards/rejected": 2.9020469188690186, "step": 306 }, { "epoch": 0.07, "learning_rate": 9.999606481269841e-06, "logits/chosen": -0.9738254547119141, "logits/rejected": -0.8809133768081665, "logps/chosen": -55.50828552246094, "logps/rejected": -65.23770904541016, "loss": 0.3667, "rewards/accuracies": 1.0, "rewards/chosen": 1.6693207025527954, "rewards/margins": 0.7439506649971008, "rewards/rejected": 0.9253700375556946, "step": 307 }, { "epoch": 0.07, "learning_rate": 9.999583673562006e-06, "logits/chosen": -1.1557424068450928, "logits/rejected": -1.153125524520874, "logps/chosen": -63.64034652709961, "logps/rejected": -60.652809143066406, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": 3.4951159954071045, "rewards/margins": 1.7544047832489014, "rewards/rejected": 1.7407112121582031, "step": 308 }, { "epoch": 0.07, "learning_rate": 9.999560223419687e-06, "logits/chosen": -1.564944863319397, "logits/rejected": -1.413487434387207, "logps/chosen": -184.39523315429688, "logps/rejected": -83.28438568115234, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 4.930596828460693, "rewards/margins": 0.5865864753723145, "rewards/rejected": 4.344010353088379, "step": 309 }, { "epoch": 0.07, "learning_rate": 9.999536130845897e-06, "logits/chosen": -0.8631553053855896, "logits/rejected": -0.7333035469055176, "logps/chosen": -70.87551879882812, "logps/rejected": -31.39008331298828, "loss": 0.0993, "rewards/accuracies": 1.0, "rewards/chosen": 1.9856071472167969, "rewards/margins": 1.8837906122207642, "rewards/rejected": 0.1018165573477745, "step": 310 }, { "epoch": 0.07, "learning_rate": 9.999511395843734e-06, "logits/chosen": -1.1591826677322388, "logits/rejected": -1.1635966300964355, "logps/chosen": -43.71396255493164, "logps/rejected": -66.26702880859375, "loss": 0.4194, "rewards/accuracies": 0.0, "rewards/chosen": 2.4277637004852295, "rewards/margins": -0.2501637935638428, "rewards/rejected": 2.6779274940490723, "step": 311 }, { "epoch": 0.07, "learning_rate": 9.999486018416375e-06, "logits/chosen": -1.2088987827301025, "logits/rejected": -1.1786772012710571, "logps/chosen": -67.6808090209961, "logps/rejected": -56.14152145385742, "loss": 1.7608, "rewards/accuracies": 0.0, "rewards/chosen": 2.2355072498321533, "rewards/margins": -1.3901209831237793, "rewards/rejected": 3.6256282329559326, "step": 312 }, { "epoch": 0.07, "learning_rate": 9.99945999856708e-06, "logits/chosen": -1.2003034353256226, "logits/rejected": -1.08482027053833, "logps/chosen": -188.36383056640625, "logps/rejected": -158.99258422851562, "loss": 0.6106, "rewards/accuracies": 1.0, "rewards/chosen": 6.578770637512207, "rewards/margins": 0.21114683151245117, "rewards/rejected": 6.367623805999756, "step": 313 }, { "epoch": 0.07, "learning_rate": 9.999433336299195e-06, "logits/chosen": -0.9907497763633728, "logits/rejected": -1.1288082599639893, "logps/chosen": -69.93970489501953, "logps/rejected": -153.2227020263672, "loss": 2.9278, "rewards/accuracies": 0.0, "rewards/chosen": 2.5871238708496094, "rewards/margins": -5.704440116882324, "rewards/rejected": 8.291563987731934, "step": 314 }, { "epoch": 0.07, "learning_rate": 9.999406031616143e-06, "logits/chosen": -0.7905613780021667, "logits/rejected": -0.6602346897125244, "logps/chosen": -72.888671875, "logps/rejected": -60.992340087890625, "loss": 0.4481, "rewards/accuracies": 0.0, "rewards/chosen": 1.3770935535430908, "rewards/margins": -0.31947779655456543, "rewards/rejected": 1.6965713500976562, "step": 315 }, { "epoch": 0.07, "learning_rate": 9.999378084521436e-06, "logits/chosen": -0.8348810076713562, "logits/rejected": -0.5916063189506531, "logps/chosen": -65.68511962890625, "logps/rejected": -22.580644607543945, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 1.8045395612716675, "rewards/margins": 1.7086708545684814, "rewards/rejected": 0.09586868435144424, "step": 316 }, { "epoch": 0.07, "learning_rate": 9.999349495018662e-06, "logits/chosen": -1.000367283821106, "logits/rejected": -0.9441484808921814, "logps/chosen": -39.25588607788086, "logps/rejected": -21.925615310668945, "loss": 0.4981, "rewards/accuracies": 1.0, "rewards/chosen": 2.7167797088623047, "rewards/margins": 1.9713068008422852, "rewards/rejected": 0.7454729080200195, "step": 317 }, { "epoch": 0.07, "learning_rate": 9.999320263111495e-06, "logits/chosen": -1.0365684032440186, "logits/rejected": -1.0185284614562988, "logps/chosen": -143.3319091796875, "logps/rejected": -98.28471374511719, "loss": 0.5404, "rewards/accuracies": 0.0, "rewards/chosen": 5.392744541168213, "rewards/margins": -0.6641297340393066, "rewards/rejected": 6.0568742752075195, "step": 318 }, { "epoch": 0.07, "learning_rate": 9.999290388803695e-06, "logits/chosen": -0.687271773815155, "logits/rejected": -0.687271773815155, "logps/chosen": -66.26649475097656, "logps/rejected": -66.26649475097656, "loss": 0.3711, "rewards/accuracies": 0.0, "rewards/chosen": 2.734570264816284, "rewards/margins": 0.0, "rewards/rejected": 2.734570264816284, "step": 319 }, { "epoch": 0.07, "learning_rate": 9.999259872099095e-06, "logits/chosen": -1.0799572467803955, "logits/rejected": -0.8797094225883484, "logps/chosen": -132.66787719726562, "logps/rejected": -68.08845520019531, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": 4.97407865524292, "rewards/margins": 2.151479482650757, "rewards/rejected": 2.822599172592163, "step": 320 }, { "epoch": 0.07, "learning_rate": 9.999228713001622e-06, "logits/chosen": -1.040118932723999, "logits/rejected": -0.962648332118988, "logps/chosen": -85.36055755615234, "logps/rejected": -83.85610961914062, "loss": 0.1308, "rewards/accuracies": 1.0, "rewards/chosen": 4.817122936248779, "rewards/margins": 1.2183642387390137, "rewards/rejected": 3.5987586975097656, "step": 321 }, { "epoch": 0.07, "learning_rate": 9.999196911515277e-06, "logits/chosen": -0.7232868671417236, "logits/rejected": -0.7564231753349304, "logps/chosen": -92.0240478515625, "logps/rejected": -87.3255615234375, "loss": 1.9385, "rewards/accuracies": 0.0, "rewards/chosen": 1.1889809370040894, "rewards/margins": -3.7762198448181152, "rewards/rejected": 4.965200901031494, "step": 322 }, { "epoch": 0.07, "learning_rate": 9.999164467644146e-06, "logits/chosen": -1.039304494857788, "logits/rejected": -1.0715240240097046, "logps/chosen": -134.28359985351562, "logps/rejected": -118.25643920898438, "loss": 0.3936, "rewards/accuracies": 0.0, "rewards/chosen": 5.1119585037231445, "rewards/margins": -0.15131521224975586, "rewards/rejected": 5.2632737159729, "step": 323 }, { "epoch": 0.07, "learning_rate": 9.999131381392397e-06, "logits/chosen": -0.9906657934188843, "logits/rejected": -0.9442919492721558, "logps/chosen": -69.48387908935547, "logps/rejected": -67.49732208251953, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 5.056676387786865, "rewards/margins": 4.220420837402344, "rewards/rejected": 0.836255669593811, "step": 324 }, { "epoch": 0.07, "learning_rate": 9.999097652764285e-06, "logits/chosen": -0.6052626967430115, "logits/rejected": -0.6052626967430115, "logps/chosen": -75.66345977783203, "logps/rejected": -75.66345977783203, "loss": 0.419, "rewards/accuracies": 0.0, "rewards/chosen": 2.3046271800994873, "rewards/margins": 0.0, "rewards/rejected": 2.3046271800994873, "step": 325 }, { "epoch": 0.07, "learning_rate": 9.999063281764142e-06, "logits/chosen": -0.8535366654396057, "logits/rejected": -0.859738290309906, "logps/chosen": -75.73491668701172, "logps/rejected": -79.5692138671875, "loss": 1.4157, "rewards/accuracies": 0.0, "rewards/chosen": 1.7169288396835327, "rewards/margins": -1.7017449140548706, "rewards/rejected": 3.4186737537384033, "step": 326 }, { "epoch": 0.07, "learning_rate": 9.999028268396384e-06, "logits/chosen": -1.125925064086914, "logits/rejected": -1.070733904838562, "logps/chosen": -52.35368347167969, "logps/rejected": -101.44523620605469, "loss": 1.4181, "rewards/accuracies": 0.0, "rewards/chosen": 6.080619812011719, "rewards/margins": -2.2038068771362305, "rewards/rejected": 8.28442668914795, "step": 327 }, { "epoch": 0.07, "learning_rate": 9.99899261266551e-06, "logits/chosen": -0.8129591345787048, "logits/rejected": -0.8129591345787048, "logps/chosen": -66.73182678222656, "logps/rejected": -66.73182678222656, "loss": 0.5775, "rewards/accuracies": 0.0, "rewards/chosen": 1.6811050176620483, "rewards/margins": 0.0, "rewards/rejected": 1.6811050176620483, "step": 328 }, { "epoch": 0.07, "learning_rate": 9.998956314576105e-06, "logits/chosen": -0.8530085682868958, "logits/rejected": -0.9155008792877197, "logps/chosen": -76.44977569580078, "logps/rejected": -95.04679870605469, "loss": 1.9638, "rewards/accuracies": 0.0, "rewards/chosen": 2.1768805980682373, "rewards/margins": -2.540217161178589, "rewards/rejected": 4.717097759246826, "step": 329 }, { "epoch": 0.07, "learning_rate": 9.998919374132829e-06, "logits/chosen": -0.808884859085083, "logits/rejected": -0.5666200518608093, "logps/chosen": -77.44657897949219, "logps/rejected": -15.482958793640137, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": 2.1255974769592285, "rewards/margins": 1.429636001586914, "rewards/rejected": 0.6959614753723145, "step": 330 }, { "epoch": 0.07, "learning_rate": 9.99888179134043e-06, "logits/chosen": -1.008626937866211, "logits/rejected": -1.083884358406067, "logps/chosen": -53.75205612182617, "logps/rejected": -98.14128875732422, "loss": 1.4341, "rewards/accuracies": 0.0, "rewards/chosen": 3.6708552837371826, "rewards/margins": -2.7253172397613525, "rewards/rejected": 6.396172523498535, "step": 331 }, { "epoch": 0.07, "learning_rate": 9.99884356620374e-06, "logits/chosen": -0.7251549363136292, "logits/rejected": -0.6483780145645142, "logps/chosen": -43.973854064941406, "logps/rejected": -30.069534301757812, "loss": 0.9391, "rewards/accuracies": 1.0, "rewards/chosen": 2.2231194972991943, "rewards/margins": 0.15882420539855957, "rewards/rejected": 2.0642952919006348, "step": 332 }, { "epoch": 0.07, "learning_rate": 9.998804698727667e-06, "logits/chosen": -1.1854784488677979, "logits/rejected": -1.0582292079925537, "logps/chosen": -82.08699035644531, "logps/rejected": -89.94803619384766, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 4.733798503875732, "rewards/margins": 2.662813901901245, "rewards/rejected": 2.0709846019744873, "step": 333 }, { "epoch": 0.07, "learning_rate": 9.998765188917206e-06, "logits/chosen": -0.9747633934020996, "logits/rejected": -1.1449624300003052, "logps/chosen": -126.86227416992188, "logps/rejected": -48.691734313964844, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 5.486371040344238, "rewards/margins": 4.768249034881592, "rewards/rejected": 0.718122124671936, "step": 334 }, { "epoch": 0.07, "learning_rate": 9.998725036777437e-06, "logits/chosen": -1.243388056755066, "logits/rejected": -1.6945875883102417, "logps/chosen": -75.01241302490234, "logps/rejected": -111.53048706054688, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 3.90683913230896, "rewards/margins": 0.13785266876220703, "rewards/rejected": 3.768986463546753, "step": 335 }, { "epoch": 0.07, "learning_rate": 9.998684242313516e-06, "logits/chosen": -1.131987452507019, "logits/rejected": -1.1510649919509888, "logps/chosen": -58.562313079833984, "logps/rejected": -67.77495574951172, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 2.1656720638275146, "rewards/margins": 0.6740689277648926, "rewards/rejected": 1.491603136062622, "step": 336 }, { "epoch": 0.07, "learning_rate": 9.998642805530687e-06, "logits/chosen": -0.46801501512527466, "logits/rejected": -0.3611673414707184, "logps/chosen": -21.534547805786133, "logps/rejected": -17.5056095123291, "loss": 0.88, "rewards/accuracies": 1.0, "rewards/chosen": 1.4349291324615479, "rewards/margins": 1.0964117050170898, "rewards/rejected": 0.3385173976421356, "step": 337 }, { "epoch": 0.07, "learning_rate": 9.998600726434274e-06, "logits/chosen": -1.2581486701965332, "logits/rejected": -1.1992298364639282, "logps/chosen": -46.87451934814453, "logps/rejected": -93.46342468261719, "loss": 0.7225, "rewards/accuracies": 0.0, "rewards/chosen": 1.5732173919677734, "rewards/margins": -0.7250797748565674, "rewards/rejected": 2.298297166824341, "step": 338 }, { "epoch": 0.08, "learning_rate": 9.998558005029685e-06, "logits/chosen": -0.8464296460151672, "logits/rejected": -0.783912181854248, "logps/chosen": -19.502010345458984, "logps/rejected": -72.39167785644531, "loss": 1.6463, "rewards/accuracies": 1.0, "rewards/chosen": 2.0501046180725098, "rewards/margins": 0.0987480878829956, "rewards/rejected": 1.9513565301895142, "step": 339 }, { "epoch": 0.08, "learning_rate": 9.998514641322406e-06, "logits/chosen": -1.4211527109146118, "logits/rejected": -1.196468710899353, "logps/chosen": -139.10787963867188, "logps/rejected": -55.532562255859375, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 5.142983913421631, "rewards/margins": 3.6983885765075684, "rewards/rejected": 1.4445953369140625, "step": 340 }, { "epoch": 0.08, "learning_rate": 9.998470635318015e-06, "logits/chosen": -1.5568993091583252, "logits/rejected": -1.386373519897461, "logps/chosen": -94.37802124023438, "logps/rejected": -147.62869262695312, "loss": 1.1219, "rewards/accuracies": 0.0, "rewards/chosen": 6.325682163238525, "rewards/margins": -2.1143813133239746, "rewards/rejected": 8.4400634765625, "step": 341 }, { "epoch": 0.08, "learning_rate": 9.99842598702216e-06, "logits/chosen": -1.0044142007827759, "logits/rejected": -0.8407231569290161, "logps/chosen": -94.530517578125, "logps/rejected": -73.89633178710938, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 5.160912990570068, "rewards/margins": 0.9594082832336426, "rewards/rejected": 4.201504707336426, "step": 342 }, { "epoch": 0.08, "learning_rate": 9.998380696440582e-06, "logits/chosen": -0.8759835958480835, "logits/rejected": -0.6204996109008789, "logps/chosen": -59.73435592651367, "logps/rejected": -47.71259689331055, "loss": 0.2802, "rewards/accuracies": 1.0, "rewards/chosen": 1.9564884901046753, "rewards/margins": 0.7635433673858643, "rewards/rejected": 1.192945122718811, "step": 343 }, { "epoch": 0.08, "learning_rate": 9.998334763579103e-06, "logits/chosen": -1.4530534744262695, "logits/rejected": -1.3634155988693237, "logps/chosen": -85.10334777832031, "logps/rejected": -74.61178588867188, "loss": 1.479, "rewards/accuracies": 0.0, "rewards/chosen": 3.7484703063964844, "rewards/margins": -1.3651323318481445, "rewards/rejected": 5.113602638244629, "step": 344 }, { "epoch": 0.08, "learning_rate": 9.998288188443619e-06, "logits/chosen": -0.6110045909881592, "logits/rejected": -0.4434454143047333, "logps/chosen": -96.06806945800781, "logps/rejected": -29.85874366760254, "loss": 0.582, "rewards/accuracies": 1.0, "rewards/chosen": 1.7676658630371094, "rewards/margins": 1.1626439094543457, "rewards/rejected": 0.6050218939781189, "step": 345 }, { "epoch": 0.08, "learning_rate": 9.99824097104012e-06, "logits/chosen": -1.2065720558166504, "logits/rejected": -1.0479384660720825, "logps/chosen": -91.76679229736328, "logps/rejected": -108.60774993896484, "loss": 0.3259, "rewards/accuracies": 1.0, "rewards/chosen": 4.5287699699401855, "rewards/margins": 0.15179157257080078, "rewards/rejected": 4.376978397369385, "step": 346 }, { "epoch": 0.08, "learning_rate": 9.998193111374673e-06, "logits/chosen": -1.1326594352722168, "logits/rejected": -1.1252412796020508, "logps/chosen": -44.34740447998047, "logps/rejected": -48.89411163330078, "loss": 0.26, "rewards/accuracies": 1.0, "rewards/chosen": 2.1146438121795654, "rewards/margins": 0.39883947372436523, "rewards/rejected": 1.7158043384552002, "step": 347 }, { "epoch": 0.08, "learning_rate": 9.998144609453425e-06, "logits/chosen": -1.1972297430038452, "logits/rejected": -1.2103843688964844, "logps/chosen": -44.32006072998047, "logps/rejected": -39.27532196044922, "loss": 0.8679, "rewards/accuracies": 1.0, "rewards/chosen": 3.2542335987091064, "rewards/margins": 0.1839156150817871, "rewards/rejected": 3.0703179836273193, "step": 348 }, { "epoch": 0.08, "learning_rate": 9.99809546528261e-06, "logits/chosen": -1.131809949874878, "logits/rejected": -1.0174583196640015, "logps/chosen": -44.657569885253906, "logps/rejected": -60.80500030517578, "loss": 0.1654, "rewards/accuracies": 1.0, "rewards/chosen": 2.315207004547119, "rewards/margins": 0.9380584955215454, "rewards/rejected": 1.3771485090255737, "step": 349 }, { "epoch": 0.08, "learning_rate": 9.998045678868541e-06, "logits/chosen": -1.0284146070480347, "logits/rejected": -0.8895039558410645, "logps/chosen": -62.03324890136719, "logps/rejected": -34.3525390625, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": 5.077370643615723, "rewards/margins": 3.656522512435913, "rewards/rejected": 1.4208481311798096, "step": 350 }, { "epoch": 0.08, "learning_rate": 9.99799525021762e-06, "logits/chosen": -1.0095549821853638, "logits/rejected": -1.0922882556915283, "logps/chosen": -97.92132568359375, "logps/rejected": -71.92364501953125, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 6.169232368469238, "rewards/margins": 4.919851779937744, "rewards/rejected": 1.2493804693222046, "step": 351 }, { "epoch": 0.08, "learning_rate": 9.997944179336323e-06, "logits/chosen": -1.1794549226760864, "logits/rejected": -1.1727604866027832, "logps/chosen": -66.39764404296875, "logps/rejected": -76.51919555664062, "loss": 1.1227, "rewards/accuracies": 1.0, "rewards/chosen": 2.68733286857605, "rewards/margins": 0.5630271434783936, "rewards/rejected": 2.1243057250976562, "step": 352 }, { "epoch": 0.08, "learning_rate": 9.997892466231215e-06, "logits/chosen": -1.029460072517395, "logits/rejected": -1.029460072517395, "logps/chosen": -26.617698669433594, "logps/rejected": -26.617698669433594, "loss": 0.8364, "rewards/accuracies": 0.0, "rewards/chosen": 2.019848585128784, "rewards/margins": 0.0, "rewards/rejected": 2.019848585128784, "step": 353 }, { "epoch": 0.08, "learning_rate": 9.997840110908938e-06, "logits/chosen": -1.1138170957565308, "logits/rejected": -1.0547810792922974, "logps/chosen": -113.95894622802734, "logps/rejected": -87.7946548461914, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 5.4420294761657715, "rewards/margins": 1.54915452003479, "rewards/rejected": 3.8928749561309814, "step": 354 }, { "epoch": 0.08, "learning_rate": 9.997787113376223e-06, "logits/chosen": -1.085853934288025, "logits/rejected": -0.8167205452919006, "logps/chosen": -107.70523834228516, "logps/rejected": -69.96115112304688, "loss": 0.1297, "rewards/accuracies": 1.0, "rewards/chosen": 4.301930904388428, "rewards/margins": 2.1686322689056396, "rewards/rejected": 2.133298635482788, "step": 355 }, { "epoch": 0.08, "learning_rate": 9.997733473639876e-06, "logits/chosen": -1.2042514085769653, "logits/rejected": -1.1595762968063354, "logps/chosen": -59.051204681396484, "logps/rejected": -51.20330810546875, "loss": 1.638, "rewards/accuracies": 0.0, "rewards/chosen": 2.405597448348999, "rewards/margins": -1.1231966018676758, "rewards/rejected": 3.528794050216675, "step": 356 }, { "epoch": 0.08, "learning_rate": 9.997679191706794e-06, "logits/chosen": -1.1129052639007568, "logits/rejected": -1.0734686851501465, "logps/chosen": -66.33561706542969, "logps/rejected": -45.75677490234375, "loss": 2.7823, "rewards/accuracies": 1.0, "rewards/chosen": 3.7887794971466064, "rewards/margins": 1.4813270568847656, "rewards/rejected": 2.307452440261841, "step": 357 }, { "epoch": 0.08, "learning_rate": 9.99762426758395e-06, "logits/chosen": -1.191640853881836, "logits/rejected": -1.181575894355774, "logps/chosen": -97.39994812011719, "logps/rejected": -66.73622131347656, "loss": 0.7242, "rewards/accuracies": 0.0, "rewards/chosen": 1.6751121282577515, "rewards/margins": -1.1381629705429077, "rewards/rejected": 2.813275098800659, "step": 358 }, { "epoch": 0.08, "learning_rate": 9.997568701278399e-06, "logits/chosen": -1.1462948322296143, "logits/rejected": -1.0369731187820435, "logps/chosen": -169.77508544921875, "logps/rejected": -90.07147216796875, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 7.062770366668701, "rewards/margins": 1.8812274932861328, "rewards/rejected": 5.181542873382568, "step": 359 }, { "epoch": 0.08, "learning_rate": 9.997512492797285e-06, "logits/chosen": -0.7705750465393066, "logits/rejected": -0.7333444356918335, "logps/chosen": -58.073543548583984, "logps/rejected": -74.03851318359375, "loss": 2.5028, "rewards/accuracies": 0.0, "rewards/chosen": 2.5562846660614014, "rewards/margins": -0.6183445453643799, "rewards/rejected": 3.1746292114257812, "step": 360 }, { "epoch": 0.08, "learning_rate": 9.997455642147831e-06, "logits/chosen": -1.340652585029602, "logits/rejected": -1.2311798334121704, "logps/chosen": -142.41932678222656, "logps/rejected": -131.0259552001953, "loss": 1.2373, "rewards/accuracies": 0.0, "rewards/chosen": 6.206962585449219, "rewards/margins": -2.381296157836914, "rewards/rejected": 8.588258743286133, "step": 361 }, { "epoch": 0.08, "learning_rate": 9.997398149337338e-06, "logits/chosen": -1.2158104181289673, "logits/rejected": -1.164663314819336, "logps/chosen": -170.22283935546875, "logps/rejected": -115.76292419433594, "loss": 1.7172, "rewards/accuracies": 1.0, "rewards/chosen": 6.010922431945801, "rewards/margins": 2.588571310043335, "rewards/rejected": 3.422351121902466, "step": 362 }, { "epoch": 0.08, "learning_rate": 9.997340014373198e-06, "logits/chosen": -0.9438089728355408, "logits/rejected": -0.6828227639198303, "logps/chosen": -55.83612060546875, "logps/rejected": -48.66030502319336, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": 4.076343536376953, "rewards/margins": 2.1282787322998047, "rewards/rejected": 1.9480648040771484, "step": 363 }, { "epoch": 0.08, "learning_rate": 9.99728123726288e-06, "logits/chosen": -0.8739621043205261, "logits/rejected": -0.60212242603302, "logps/chosen": -91.54925537109375, "logps/rejected": -53.93280792236328, "loss": 0.4792, "rewards/accuracies": 1.0, "rewards/chosen": 6.281381130218506, "rewards/margins": 4.1949663162231445, "rewards/rejected": 2.0864150524139404, "step": 364 }, { "epoch": 0.08, "learning_rate": 9.997221818013933e-06, "logits/chosen": -0.7349117994308472, "logits/rejected": -0.5767457485198975, "logps/chosen": -50.957794189453125, "logps/rejected": -58.362152099609375, "loss": 0.8144, "rewards/accuracies": 0.0, "rewards/chosen": 1.9754456281661987, "rewards/margins": -0.16586005687713623, "rewards/rejected": 2.141305685043335, "step": 365 }, { "epoch": 0.08, "learning_rate": 9.997161756633998e-06, "logits/chosen": -1.0288249254226685, "logits/rejected": -1.059548258781433, "logps/chosen": -58.46907043457031, "logps/rejected": -42.67140197753906, "loss": 0.362, "rewards/accuracies": 1.0, "rewards/chosen": 3.02152419090271, "rewards/margins": 0.1820542812347412, "rewards/rejected": 2.8394699096679688, "step": 366 }, { "epoch": 0.08, "learning_rate": 9.99710105313079e-06, "logits/chosen": -0.7815724611282349, "logits/rejected": -0.752529501914978, "logps/chosen": -39.551513671875, "logps/rejected": -54.342132568359375, "loss": 0.771, "rewards/accuracies": 1.0, "rewards/chosen": 2.2782821655273438, "rewards/margins": 0.20049595832824707, "rewards/rejected": 2.0777862071990967, "step": 367 }, { "epoch": 0.08, "learning_rate": 9.997039707512109e-06, "logits/chosen": -0.9970613121986389, "logits/rejected": -0.7971800565719604, "logps/chosen": -83.68716430664062, "logps/rejected": -18.65874481201172, "loss": 0.0756, "rewards/accuracies": 1.0, "rewards/chosen": 2.3076171875, "rewards/margins": 2.271885395050049, "rewards/rejected": 0.03573188930749893, "step": 368 }, { "epoch": 0.08, "learning_rate": 9.996977719785837e-06, "logits/chosen": -0.9092705845832825, "logits/rejected": -0.8166900277137756, "logps/chosen": -114.39994049072266, "logps/rejected": -26.67734718322754, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": 1.875405192375183, "rewards/margins": 1.8375036716461182, "rewards/rejected": 0.03790149837732315, "step": 369 }, { "epoch": 0.08, "learning_rate": 9.996915089959942e-06, "logits/chosen": -0.7217095494270325, "logits/rejected": -0.601789653301239, "logps/chosen": -154.73699951171875, "logps/rejected": -76.37847900390625, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": 6.86041259765625, "rewards/margins": 3.1509032249450684, "rewards/rejected": 3.7095093727111816, "step": 370 }, { "epoch": 0.08, "learning_rate": 9.99685181804247e-06, "logits/chosen": -1.0723515748977661, "logits/rejected": -1.0134449005126953, "logps/chosen": -42.968658447265625, "logps/rejected": -50.369693756103516, "loss": 0.527, "rewards/accuracies": 0.0, "rewards/chosen": 1.4104034900665283, "rewards/margins": -0.023068547248840332, "rewards/rejected": 1.4334720373153687, "step": 371 }, { "epoch": 0.08, "learning_rate": 9.996787904041551e-06, "logits/chosen": -0.9280391931533813, "logits/rejected": -0.8286060094833374, "logps/chosen": -79.05819702148438, "logps/rejected": -95.24681091308594, "loss": 0.7321, "rewards/accuracies": 0.0, "rewards/chosen": 3.193110704421997, "rewards/margins": -1.1755964756011963, "rewards/rejected": 4.368707180023193, "step": 372 }, { "epoch": 0.08, "learning_rate": 9.996723347965399e-06, "logits/chosen": -1.2246544361114502, "logits/rejected": -1.1674221754074097, "logps/chosen": -83.81251525878906, "logps/rejected": -34.38890075683594, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": 3.020102024078369, "rewards/margins": 2.631072759628296, "rewards/rejected": 0.389029324054718, "step": 373 }, { "epoch": 0.08, "learning_rate": 9.996658149822307e-06, "logits/chosen": -0.7777931094169617, "logits/rejected": -0.7503339648246765, "logps/chosen": -68.87490844726562, "logps/rejected": -63.16482162475586, "loss": 0.4953, "rewards/accuracies": 0.0, "rewards/chosen": 1.6985787153244019, "rewards/margins": -0.49762070178985596, "rewards/rejected": 2.196199417114258, "step": 374 }, { "epoch": 0.08, "learning_rate": 9.996592309620656e-06, "logits/chosen": -1.0791407823562622, "logits/rejected": -0.8083234429359436, "logps/chosen": -152.86988830566406, "logps/rejected": -29.248291015625, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 5.898723125457764, "rewards/margins": 5.5242533683776855, "rewards/rejected": 0.3744697570800781, "step": 375 }, { "epoch": 0.08, "learning_rate": 9.996525827368903e-06, "logits/chosen": -1.4031093120574951, "logits/rejected": -1.2657495737075806, "logps/chosen": -72.90148162841797, "logps/rejected": -27.016687393188477, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": 3.177100419998169, "rewards/margins": 2.6011271476745605, "rewards/rejected": 0.5759733319282532, "step": 376 }, { "epoch": 0.08, "learning_rate": 9.996458703075593e-06, "logits/chosen": -0.853143572807312, "logits/rejected": -0.7885818481445312, "logps/chosen": -73.41267395019531, "logps/rejected": -62.23524475097656, "loss": 0.8215, "rewards/accuracies": 0.0, "rewards/chosen": 1.3087753057479858, "rewards/margins": -0.08951115608215332, "rewards/rejected": 1.3982864618301392, "step": 377 }, { "epoch": 0.08, "learning_rate": 9.996390936749351e-06, "logits/chosen": -1.173913598060608, "logits/rejected": -0.998948872089386, "logps/chosen": -64.33345794677734, "logps/rejected": -19.01967430114746, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": 3.8760507106781006, "rewards/margins": 3.496424436569214, "rewards/rejected": 0.3796262741088867, "step": 378 }, { "epoch": 0.08, "learning_rate": 9.996322528398886e-06, "logits/chosen": -0.9423243403434753, "logits/rejected": -0.8063523173332214, "logps/chosen": -146.21466064453125, "logps/rejected": -80.96806335449219, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 5.691870212554932, "rewards/margins": 2.889265537261963, "rewards/rejected": 2.8026046752929688, "step": 379 }, { "epoch": 0.08, "learning_rate": 9.996253478032987e-06, "logits/chosen": -1.0108728408813477, "logits/rejected": -0.9311748743057251, "logps/chosen": -92.06964111328125, "logps/rejected": -49.19404602050781, "loss": 0.5478, "rewards/accuracies": 0.0, "rewards/chosen": 2.7547767162323, "rewards/margins": -0.16513681411743164, "rewards/rejected": 2.9199135303497314, "step": 380 }, { "epoch": 0.08, "learning_rate": 9.996183785660526e-06, "logits/chosen": -0.9492632150650024, "logits/rejected": -1.0817434787750244, "logps/chosen": -68.09960174560547, "logps/rejected": -24.650066375732422, "loss": 0.4346, "rewards/accuracies": 1.0, "rewards/chosen": 2.426001787185669, "rewards/margins": 1.5038620233535767, "rewards/rejected": 0.9221397638320923, "step": 381 }, { "epoch": 0.08, "learning_rate": 9.996113451290457e-06, "logits/chosen": -0.5845862627029419, "logits/rejected": -0.6672322750091553, "logps/chosen": -29.32029914855957, "logps/rejected": -74.50837707519531, "loss": 0.9803, "rewards/accuracies": 0.0, "rewards/chosen": -0.00267200474627316, "rewards/margins": -1.768509864807129, "rewards/rejected": 1.7658379077911377, "step": 382 }, { "epoch": 0.08, "learning_rate": 9.996042474931821e-06, "logits/chosen": -0.9688534736633301, "logits/rejected": -0.9678943157196045, "logps/chosen": -40.91279602050781, "logps/rejected": -61.69241714477539, "loss": 0.8948, "rewards/accuracies": 0.0, "rewards/chosen": 2.7797317504882812, "rewards/margins": -0.1622607707977295, "rewards/rejected": 2.9419925212860107, "step": 383 }, { "epoch": 0.08, "learning_rate": 9.995970856593739e-06, "logits/chosen": -0.8233112692832947, "logits/rejected": -0.8094332814216614, "logps/chosen": -57.09180450439453, "logps/rejected": -72.15693664550781, "loss": 0.6749, "rewards/accuracies": 0.0, "rewards/chosen": 2.9839560985565186, "rewards/margins": -0.9083306789398193, "rewards/rejected": 3.892286777496338, "step": 384 }, { "epoch": 0.09, "learning_rate": 9.99589859628541e-06, "logits/chosen": -0.6621935963630676, "logits/rejected": -0.7047179937362671, "logps/chosen": -19.704120635986328, "logps/rejected": -50.275634765625, "loss": 1.0476, "rewards/accuracies": 0.0, "rewards/chosen": 0.562782883644104, "rewards/margins": -1.6210931539535522, "rewards/rejected": 2.1838760375976562, "step": 385 }, { "epoch": 0.09, "learning_rate": 9.995825694016122e-06, "logits/chosen": -0.8357635140419006, "logits/rejected": -0.9336518049240112, "logps/chosen": -64.35545349121094, "logps/rejected": -189.44961547851562, "loss": 4.9226, "rewards/accuracies": 0.0, "rewards/chosen": 2.061166524887085, "rewards/margins": -4.768536567687988, "rewards/rejected": 6.829702854156494, "step": 386 }, { "epoch": 0.09, "learning_rate": 9.995752149795241e-06, "logits/chosen": -1.3173155784606934, "logits/rejected": -1.1538640260696411, "logps/chosen": -69.47480773925781, "logps/rejected": -21.611480712890625, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 2.7083663940429688, "rewards/margins": 2.0933737754821777, "rewards/rejected": 0.6149925589561462, "step": 387 }, { "epoch": 0.09, "learning_rate": 9.99567796363222e-06, "logits/chosen": -0.9001544117927551, "logits/rejected": -0.905676007270813, "logps/chosen": -175.0158233642578, "logps/rejected": -140.07948303222656, "loss": 0.3334, "rewards/accuracies": 1.0, "rewards/chosen": 7.28765869140625, "rewards/margins": 0.1672835350036621, "rewards/rejected": 7.120375156402588, "step": 388 }, { "epoch": 0.09, "learning_rate": 9.995603135536587e-06, "logits/chosen": -1.1952651739120483, "logits/rejected": -1.2233600616455078, "logps/chosen": -46.04562759399414, "logps/rejected": -85.4742660522461, "loss": 0.4084, "rewards/accuracies": 0.0, "rewards/chosen": 1.9034541845321655, "rewards/margins": -0.20240437984466553, "rewards/rejected": 2.105858564376831, "step": 389 }, { "epoch": 0.09, "learning_rate": 9.995527665517964e-06, "logits/chosen": -0.9687427282333374, "logits/rejected": -0.9478734135627747, "logps/chosen": -39.023555755615234, "logps/rejected": -62.69960021972656, "loss": 0.7498, "rewards/accuracies": 0.0, "rewards/chosen": 1.934686303138733, "rewards/margins": -0.14772498607635498, "rewards/rejected": 2.082411289215088, "step": 390 }, { "epoch": 0.09, "learning_rate": 9.995451553586042e-06, "logits/chosen": -0.7816596031188965, "logits/rejected": -0.9683504700660706, "logps/chosen": -63.917049407958984, "logps/rejected": -63.176414489746094, "loss": 0.4331, "rewards/accuracies": 1.0, "rewards/chosen": 2.3945019245147705, "rewards/margins": 0.11758160591125488, "rewards/rejected": 2.2769203186035156, "step": 391 }, { "epoch": 0.09, "learning_rate": 9.995374799750606e-06, "logits/chosen": -0.840591311454773, "logits/rejected": -0.840591311454773, "logps/chosen": -51.701881408691406, "logps/rejected": -51.701881408691406, "loss": 0.3682, "rewards/accuracies": 0.0, "rewards/chosen": 0.5210247039794922, "rewards/margins": 0.0, "rewards/rejected": 0.5210247039794922, "step": 392 }, { "epoch": 0.09, "learning_rate": 9.995297404021515e-06, "logits/chosen": -1.1140918731689453, "logits/rejected": -1.1764634847640991, "logps/chosen": -83.57429504394531, "logps/rejected": -47.06970977783203, "loss": 0.4122, "rewards/accuracies": 0.0, "rewards/chosen": 3.2020423412323, "rewards/margins": -0.08952498435974121, "rewards/rejected": 3.291567325592041, "step": 393 }, { "epoch": 0.09, "learning_rate": 9.995219366408717e-06, "logits/chosen": -0.9927346706390381, "logits/rejected": -0.910828709602356, "logps/chosen": -73.26206970214844, "logps/rejected": -58.521087646484375, "loss": 0.6001, "rewards/accuracies": 0.0, "rewards/chosen": 1.9410003423690796, "rewards/margins": -0.02884066104888916, "rewards/rejected": 1.9698410034179688, "step": 394 }, { "epoch": 0.09, "learning_rate": 9.995140686922237e-06, "logits/chosen": -0.8965963125228882, "logits/rejected": -0.7790437340736389, "logps/chosen": -57.06755828857422, "logps/rejected": -88.111572265625, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 2.568666934967041, "rewards/margins": -1.0896856784820557, "rewards/rejected": 3.6583526134490967, "step": 395 }, { "epoch": 0.09, "learning_rate": 9.995061365572188e-06, "logits/chosen": -1.250868320465088, "logits/rejected": -1.2603248357772827, "logps/chosen": -49.91297912597656, "logps/rejected": -63.49680709838867, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": 2.6614837646484375, "rewards/margins": 1.3948085308074951, "rewards/rejected": 1.2666752338409424, "step": 396 }, { "epoch": 0.09, "learning_rate": 9.994981402368763e-06, "logits/chosen": -2.445448637008667, "logits/rejected": -2.371452808380127, "logps/chosen": -110.67996978759766, "logps/rejected": -142.30181884765625, "loss": 2.3843, "rewards/accuracies": 0.0, "rewards/chosen": 2.146416425704956, "rewards/margins": -4.755918502807617, "rewards/rejected": 6.902334690093994, "step": 397 }, { "epoch": 0.09, "learning_rate": 9.994900797322233e-06, "logits/chosen": -1.2052781581878662, "logits/rejected": -1.140387773513794, "logps/chosen": -114.4261474609375, "logps/rejected": -102.17315673828125, "loss": 0.3039, "rewards/accuracies": 1.0, "rewards/chosen": 4.282458782196045, "rewards/margins": 0.21896696090698242, "rewards/rejected": 4.0634918212890625, "step": 398 }, { "epoch": 0.09, "learning_rate": 9.994819550442958e-06, "logits/chosen": -1.2041993141174316, "logits/rejected": -1.2356613874435425, "logps/chosen": -54.70343780517578, "logps/rejected": -71.4947509765625, "loss": 0.9128, "rewards/accuracies": 0.0, "rewards/chosen": 2.986072540283203, "rewards/margins": -0.042917728424072266, "rewards/rejected": 3.0289902687072754, "step": 399 }, { "epoch": 0.09, "learning_rate": 9.994737661741379e-06, "logits/chosen": -1.5086815357208252, "logits/rejected": -1.319426417350769, "logps/chosen": -143.7400360107422, "logps/rejected": -117.16201782226562, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 5.368400573730469, "rewards/margins": 2.090589761734009, "rewards/rejected": 3.27781081199646, "step": 400 }, { "epoch": 0.09, "learning_rate": 9.994655131228017e-06, "logits/chosen": -0.9797889590263367, "logits/rejected": -0.9831463694572449, "logps/chosen": -41.41899108886719, "logps/rejected": -33.16545486450195, "loss": 0.563, "rewards/accuracies": 1.0, "rewards/chosen": 1.6969234943389893, "rewards/margins": 0.6130951642990112, "rewards/rejected": 1.083828330039978, "step": 401 }, { "epoch": 0.09, "learning_rate": 9.994571958913477e-06, "logits/chosen": -1.1446845531463623, "logits/rejected": -1.1446845531463623, "logps/chosen": -57.91332244873047, "logps/rejected": -57.91332244873047, "loss": 0.3583, "rewards/accuracies": 0.0, "rewards/chosen": 1.8962631225585938, "rewards/margins": 0.0, "rewards/rejected": 1.8962631225585938, "step": 402 }, { "epoch": 0.09, "learning_rate": 9.994488144808449e-06, "logits/chosen": -1.0637084245681763, "logits/rejected": -1.2556850910186768, "logps/chosen": -37.519222259521484, "logps/rejected": -188.866943359375, "loss": 3.001, "rewards/accuracies": 0.0, "rewards/chosen": 1.8109012842178345, "rewards/margins": -5.871648788452148, "rewards/rejected": 7.682549953460693, "step": 403 }, { "epoch": 0.09, "learning_rate": 9.994403688923699e-06, "logits/chosen": -1.0005736351013184, "logits/rejected": -0.9910703301429749, "logps/chosen": -42.84333038330078, "logps/rejected": -102.29814147949219, "loss": 0.3624, "rewards/accuracies": 0.0, "rewards/chosen": 2.302699327468872, "rewards/margins": -0.03800201416015625, "rewards/rejected": 2.3407013416290283, "step": 404 }, { "epoch": 0.09, "learning_rate": 9.994318591270081e-06, "logits/chosen": -1.2830901145935059, "logits/rejected": -1.1668366193771362, "logps/chosen": -68.67174530029297, "logps/rejected": -55.17259979248047, "loss": 0.2604, "rewards/accuracies": 1.0, "rewards/chosen": 1.2025063037872314, "rewards/margins": 0.41632771492004395, "rewards/rejected": 0.7861785888671875, "step": 405 }, { "epoch": 0.09, "learning_rate": 9.99423285185853e-06, "logits/chosen": -0.8395268321037292, "logits/rejected": -0.8687115907669067, "logps/chosen": -34.009281158447266, "logps/rejected": -39.164703369140625, "loss": 1.6187, "rewards/accuracies": 0.0, "rewards/chosen": 1.489121675491333, "rewards/margins": -0.4786578416824341, "rewards/rejected": 1.967779517173767, "step": 406 }, { "epoch": 0.09, "learning_rate": 9.994146470700065e-06, "logits/chosen": -1.3037281036376953, "logits/rejected": -1.418383240699768, "logps/chosen": -34.413185119628906, "logps/rejected": -153.80792236328125, "loss": 2.4222, "rewards/accuracies": 0.0, "rewards/chosen": 1.9469566345214844, "rewards/margins": -3.2710366249084473, "rewards/rejected": 5.217993259429932, "step": 407 }, { "epoch": 0.09, "learning_rate": 9.994059447805781e-06, "logits/chosen": -1.2782924175262451, "logits/rejected": -1.217561960220337, "logps/chosen": -120.32059478759766, "logps/rejected": -121.68245697021484, "loss": 0.3577, "rewards/accuracies": 1.0, "rewards/chosen": 6.30678653717041, "rewards/margins": 0.8836321830749512, "rewards/rejected": 5.423154354095459, "step": 408 }, { "epoch": 0.09, "learning_rate": 9.993971783186867e-06, "logits/chosen": -1.126456379890442, "logits/rejected": -1.0119926929473877, "logps/chosen": -77.31990814208984, "logps/rejected": -72.23600769042969, "loss": 0.3562, "rewards/accuracies": 1.0, "rewards/chosen": 4.398971080780029, "rewards/margins": 1.0234110355377197, "rewards/rejected": 3.3755600452423096, "step": 409 }, { "epoch": 0.09, "learning_rate": 9.993883476854582e-06, "logits/chosen": -1.2378102540969849, "logits/rejected": -1.170257329940796, "logps/chosen": -68.3250503540039, "logps/rejected": -66.75344848632812, "loss": 0.8815, "rewards/accuracies": 0.0, "rewards/chosen": 2.6977927684783936, "rewards/margins": -1.5333473682403564, "rewards/rejected": 4.23114013671875, "step": 410 }, { "epoch": 0.09, "learning_rate": 9.993794528820275e-06, "logits/chosen": -1.3501172065734863, "logits/rejected": -1.26198410987854, "logps/chosen": -45.462913513183594, "logps/rejected": -54.19338607788086, "loss": 0.5809, "rewards/accuracies": 0.0, "rewards/chosen": 2.4247078895568848, "rewards/margins": -0.4657566547393799, "rewards/rejected": 2.8904645442962646, "step": 411 }, { "epoch": 0.09, "learning_rate": 9.993704939095376e-06, "logits/chosen": -0.8259392380714417, "logits/rejected": -0.858603298664093, "logps/chosen": -35.037574768066406, "logps/rejected": -64.90385437011719, "loss": 0.3497, "rewards/accuracies": 1.0, "rewards/chosen": 2.3904731273651123, "rewards/margins": 0.08754181861877441, "rewards/rejected": 2.302931308746338, "step": 412 }, { "epoch": 0.09, "learning_rate": 9.9936147076914e-06, "logits/chosen": -1.0673331022262573, "logits/rejected": -1.0894776582717896, "logps/chosen": -132.66070556640625, "logps/rejected": -126.44377136230469, "loss": 1.2034, "rewards/accuracies": 0.0, "rewards/chosen": 4.225013732910156, "rewards/margins": -2.307852268218994, "rewards/rejected": 6.53286600112915, "step": 413 }, { "epoch": 0.09, "learning_rate": 9.993523834619933e-06, "logits/chosen": -0.9833309650421143, "logits/rejected": -1.1945655345916748, "logps/chosen": -85.07371520996094, "logps/rejected": -177.84005737304688, "loss": 0.4701, "rewards/accuracies": 0.0, "rewards/chosen": 5.650186061859131, "rewards/margins": -0.39942359924316406, "rewards/rejected": 6.049609661102295, "step": 414 }, { "epoch": 0.09, "learning_rate": 9.99343231989266e-06, "logits/chosen": -1.2237918376922607, "logits/rejected": -1.2637990713119507, "logps/chosen": -44.340579986572266, "logps/rejected": -52.910457611083984, "loss": 1.2919, "rewards/accuracies": 0.0, "rewards/chosen": 2.2869985103607178, "rewards/margins": -1.396043300628662, "rewards/rejected": 3.68304181098938, "step": 415 }, { "epoch": 0.09, "learning_rate": 9.99334016352134e-06, "logits/chosen": -1.1299883127212524, "logits/rejected": -1.1542015075683594, "logps/chosen": -66.92642974853516, "logps/rejected": -110.94792175292969, "loss": 0.232, "rewards/accuracies": 1.0, "rewards/chosen": 5.859742164611816, "rewards/margins": 1.2701668739318848, "rewards/rejected": 4.589575290679932, "step": 416 }, { "epoch": 0.09, "learning_rate": 9.993247365517808e-06, "logits/chosen": -1.256980299949646, "logits/rejected": -1.227941870689392, "logps/chosen": -43.477561950683594, "logps/rejected": -37.052852630615234, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 3.9123008251190186, "rewards/margins": 1.8019084930419922, "rewards/rejected": 2.1103923320770264, "step": 417 }, { "epoch": 0.09, "learning_rate": 9.993153925893997e-06, "logits/chosen": -1.3212008476257324, "logits/rejected": -1.1757242679595947, "logps/chosen": -134.2371063232422, "logps/rejected": -62.0867805480957, "loss": 0.3053, "rewards/accuracies": 1.0, "rewards/chosen": 5.531877040863037, "rewards/margins": 2.8681764602661133, "rewards/rejected": 2.663700580596924, "step": 418 }, { "epoch": 0.09, "learning_rate": 9.993059844661908e-06, "logits/chosen": -1.3528740406036377, "logits/rejected": -1.2765024900436401, "logps/chosen": -122.68148803710938, "logps/rejected": -46.24375915527344, "loss": 0.3051, "rewards/accuracies": 1.0, "rewards/chosen": 6.644952297210693, "rewards/margins": 3.5855023860931396, "rewards/rejected": 3.0594499111175537, "step": 419 }, { "epoch": 0.09, "learning_rate": 9.992965121833631e-06, "logits/chosen": -1.1084623336791992, "logits/rejected": -1.1084623336791992, "logps/chosen": -104.72291564941406, "logps/rejected": -104.72291564941406, "loss": 0.4397, "rewards/accuracies": 0.0, "rewards/chosen": 4.320231914520264, "rewards/margins": 0.0, "rewards/rejected": 4.320231914520264, "step": 420 }, { "epoch": 0.09, "learning_rate": 9.99286975742134e-06, "logits/chosen": -1.012387752532959, "logits/rejected": -1.1794029474258423, "logps/chosen": -66.39193725585938, "logps/rejected": -82.57000732421875, "loss": 1.5897, "rewards/accuracies": 0.0, "rewards/chosen": 2.896881103515625, "rewards/margins": -3.123684883117676, "rewards/rejected": 6.020565986633301, "step": 421 }, { "epoch": 0.09, "learning_rate": 9.992773751437288e-06, "logits/chosen": -1.3756214380264282, "logits/rejected": -1.3064359426498413, "logps/chosen": -145.04739379882812, "logps/rejected": -138.04736328125, "loss": 0.8167, "rewards/accuracies": 0.0, "rewards/chosen": 6.184978008270264, "rewards/margins": -1.4029617309570312, "rewards/rejected": 7.587939739227295, "step": 422 }, { "epoch": 0.09, "learning_rate": 9.99267710389381e-06, "logits/chosen": -1.0549068450927734, "logits/rejected": -0.7245635390281677, "logps/chosen": -119.6545639038086, "logps/rejected": -30.315900802612305, "loss": 0.5563, "rewards/accuracies": 1.0, "rewards/chosen": 5.191346168518066, "rewards/margins": 4.802779197692871, "rewards/rejected": 0.388566792011261, "step": 423 }, { "epoch": 0.09, "learning_rate": 9.992579814803327e-06, "logits/chosen": -0.8991936445236206, "logits/rejected": -0.8991936445236206, "logps/chosen": -27.228445053100586, "logps/rejected": -27.228445053100586, "loss": 0.5294, "rewards/accuracies": 0.0, "rewards/chosen": 1.1552584171295166, "rewards/margins": 0.0, "rewards/rejected": 1.1552584171295166, "step": 424 }, { "epoch": 0.09, "learning_rate": 9.992481884178338e-06, "logits/chosen": -0.75602787733078, "logits/rejected": -0.7337138056755066, "logps/chosen": -67.95777893066406, "logps/rejected": -53.36183547973633, "loss": 0.4358, "rewards/accuracies": 0.0, "rewards/chosen": 1.79521644115448, "rewards/margins": -0.32806670665740967, "rewards/rejected": 2.1232831478118896, "step": 425 }, { "epoch": 0.09, "learning_rate": 9.99238331203143e-06, "logits/chosen": -1.3702294826507568, "logits/rejected": -1.1562025547027588, "logps/chosen": -165.28823852539062, "logps/rejected": -35.36894989013672, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 7.1281418800354, "rewards/margins": 7.008088111877441, "rewards/rejected": 0.12005386501550674, "step": 426 }, { "epoch": 0.09, "learning_rate": 9.99228409837527e-06, "logits/chosen": -1.1643750667572021, "logits/rejected": -1.2385261058807373, "logps/chosen": -39.81855773925781, "logps/rejected": -94.23722076416016, "loss": 1.4804, "rewards/accuracies": 0.0, "rewards/chosen": 2.8982338905334473, "rewards/margins": -2.906968593597412, "rewards/rejected": 5.805202484130859, "step": 427 }, { "epoch": 0.09, "learning_rate": 9.9921842432226e-06, "logits/chosen": -1.252644658088684, "logits/rejected": -1.1147631406784058, "logps/chosen": -98.74085998535156, "logps/rejected": -98.67686462402344, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": 6.679579257965088, "rewards/margins": 2.0952682495117188, "rewards/rejected": 4.584311008453369, "step": 428 }, { "epoch": 0.09, "learning_rate": 9.992083746586258e-06, "logits/chosen": -0.9760788679122925, "logits/rejected": -0.8490703701972961, "logps/chosen": -34.649532318115234, "logps/rejected": -59.88387680053711, "loss": 2.1163, "rewards/accuracies": 0.0, "rewards/chosen": 2.0670742988586426, "rewards/margins": -2.027491569519043, "rewards/rejected": 4.0945658683776855, "step": 429 }, { "epoch": 0.1, "learning_rate": 9.991982608479156e-06, "logits/chosen": -1.0419912338256836, "logits/rejected": -0.9431798458099365, "logps/chosen": -81.41801452636719, "logps/rejected": -62.00787353515625, "loss": 0.261, "rewards/accuracies": 1.0, "rewards/chosen": 3.918771505355835, "rewards/margins": 0.9257822036743164, "rewards/rejected": 2.9929893016815186, "step": 430 }, { "epoch": 0.1, "learning_rate": 9.991880828914288e-06, "logits/chosen": -1.2932006120681763, "logits/rejected": -1.3901638984680176, "logps/chosen": -67.53770446777344, "logps/rejected": -92.25234985351562, "loss": 1.606, "rewards/accuracies": 0.0, "rewards/chosen": 3.6714935302734375, "rewards/margins": -2.9908385276794434, "rewards/rejected": 6.662332057952881, "step": 431 }, { "epoch": 0.1, "learning_rate": 9.991778407904733e-06, "logits/chosen": -1.5508724451065063, "logits/rejected": -1.5902926921844482, "logps/chosen": -105.31689453125, "logps/rejected": -90.90852355957031, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": 7.187950134277344, "rewards/margins": 3.29439377784729, "rewards/rejected": 3.8935563564300537, "step": 432 }, { "epoch": 0.1, "learning_rate": 9.991675345463654e-06, "logits/chosen": -1.096520185470581, "logits/rejected": -0.7180580496788025, "logps/chosen": -116.34668731689453, "logps/rejected": -57.55339050292969, "loss": 0.4127, "rewards/accuracies": 0.0, "rewards/chosen": 3.5789268016815186, "rewards/margins": -0.1609177589416504, "rewards/rejected": 3.739844560623169, "step": 433 }, { "epoch": 0.1, "learning_rate": 9.991571641604291e-06, "logits/chosen": -1.3545341491699219, "logits/rejected": -1.2141720056533813, "logps/chosen": -78.56146240234375, "logps/rejected": -37.94845199584961, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 2.0275871753692627, "rewards/margins": 1.8507740497589111, "rewards/rejected": 0.17681312561035156, "step": 434 }, { "epoch": 0.1, "learning_rate": 9.991467296339973e-06, "logits/chosen": -0.9956603646278381, "logits/rejected": -0.9652508497238159, "logps/chosen": -44.26707458496094, "logps/rejected": -45.65003967285156, "loss": 0.3253, "rewards/accuracies": 1.0, "rewards/chosen": 3.543966054916382, "rewards/margins": 0.09231042861938477, "rewards/rejected": 3.451655626296997, "step": 435 }, { "epoch": 0.1, "learning_rate": 9.991362309684105e-06, "logits/chosen": -1.1821752786636353, "logits/rejected": -1.345666527748108, "logps/chosen": -69.90108489990234, "logps/rejected": -88.59563446044922, "loss": 2.8503, "rewards/accuracies": 0.0, "rewards/chosen": 1.0988044738769531, "rewards/margins": -4.413568019866943, "rewards/rejected": 5.5123724937438965, "step": 436 }, { "epoch": 0.1, "learning_rate": 9.991256681650181e-06, "logits/chosen": -1.0185877084732056, "logits/rejected": -1.0483603477478027, "logps/chosen": -68.27198791503906, "logps/rejected": -109.99217224121094, "loss": 1.0003, "rewards/accuracies": 0.0, "rewards/chosen": 0.4869224727153778, "rewards/margins": -1.8360862731933594, "rewards/rejected": 2.3230087757110596, "step": 437 }, { "epoch": 0.1, "learning_rate": 9.99115041225177e-06, "logits/chosen": -1.1766908168792725, "logits/rejected": -0.8050733208656311, "logps/chosen": -90.39063262939453, "logps/rejected": -29.92137908935547, "loss": 0.7212, "rewards/accuracies": 1.0, "rewards/chosen": 6.375953197479248, "rewards/margins": 5.868461608886719, "rewards/rejected": 0.5074917078018188, "step": 438 }, { "epoch": 0.1, "learning_rate": 9.991043501502532e-06, "logits/chosen": -1.0028319358825684, "logits/rejected": -1.0270601511001587, "logps/chosen": -83.09378814697266, "logps/rejected": -100.84490203857422, "loss": 1.2304, "rewards/accuracies": 0.0, "rewards/chosen": 3.010572910308838, "rewards/margins": -2.3563880920410156, "rewards/rejected": 5.3669610023498535, "step": 439 }, { "epoch": 0.1, "learning_rate": 9.9909359494162e-06, "logits/chosen": -1.1709550619125366, "logits/rejected": -1.2170320749282837, "logps/chosen": -82.10446166992188, "logps/rejected": -88.45530700683594, "loss": 0.5384, "rewards/accuracies": 0.0, "rewards/chosen": 5.562045574188232, "rewards/margins": -0.37878847122192383, "rewards/rejected": 5.940834045410156, "step": 440 }, { "epoch": 0.1, "learning_rate": 9.990827756006599e-06, "logits/chosen": -1.157400131225586, "logits/rejected": -1.1373258829116821, "logps/chosen": -78.77189636230469, "logps/rejected": -95.66671752929688, "loss": 1.014, "rewards/accuracies": 0.0, "rewards/chosen": 3.536123752593994, "rewards/margins": -0.2566864490509033, "rewards/rejected": 3.7928102016448975, "step": 441 }, { "epoch": 0.1, "learning_rate": 9.990718921287625e-06, "logits/chosen": -0.6869906783103943, "logits/rejected": -0.6216124296188354, "logps/chosen": -37.854034423828125, "logps/rejected": -15.056750297546387, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": 1.6715103387832642, "rewards/margins": 1.136685848236084, "rewards/rejected": 0.5348244905471802, "step": 442 }, { "epoch": 0.1, "learning_rate": 9.99060944527327e-06, "logits/chosen": -1.3269258737564087, "logits/rejected": -1.415663480758667, "logps/chosen": -84.4134521484375, "logps/rejected": -189.8498077392578, "loss": 1.9688, "rewards/accuracies": 0.0, "rewards/chosen": 5.297214031219482, "rewards/margins": -3.872044086456299, "rewards/rejected": 9.169258117675781, "step": 443 }, { "epoch": 0.1, "learning_rate": 9.990499327977599e-06, "logits/chosen": -1.19383704662323, "logits/rejected": -0.9605644941329956, "logps/chosen": -85.04826354980469, "logps/rejected": -32.714420318603516, "loss": 0.1775, "rewards/accuracies": 1.0, "rewards/chosen": 4.771267890930176, "rewards/margins": 4.190685272216797, "rewards/rejected": 0.5805827975273132, "step": 444 }, { "epoch": 0.1, "learning_rate": 9.990388569414759e-06, "logits/chosen": -1.3546775579452515, "logits/rejected": -1.2571176290512085, "logps/chosen": -66.32545471191406, "logps/rejected": -53.159263610839844, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 4.8691864013671875, "rewards/margins": 3.2749900817871094, "rewards/rejected": 1.5941963195800781, "step": 445 }, { "epoch": 0.1, "learning_rate": 9.990277169598985e-06, "logits/chosen": -0.9450092315673828, "logits/rejected": -1.0464507341384888, "logps/chosen": -69.21243286132812, "logps/rejected": -114.02739715576172, "loss": 2.1045, "rewards/accuracies": 0.0, "rewards/chosen": 2.377703905105591, "rewards/margins": -1.6052589416503906, "rewards/rejected": 3.9829628467559814, "step": 446 }, { "epoch": 0.1, "learning_rate": 9.99016512854459e-06, "logits/chosen": -1.1122440099716187, "logits/rejected": -0.9831613302230835, "logps/chosen": -125.08135223388672, "logps/rejected": -48.004150390625, "loss": 0.4681, "rewards/accuracies": 1.0, "rewards/chosen": 6.693772315979004, "rewards/margins": 4.103621482849121, "rewards/rejected": 2.5901505947113037, "step": 447 }, { "epoch": 0.1, "learning_rate": 9.990052446265974e-06, "logits/chosen": -1.1117905378341675, "logits/rejected": -1.0495680570602417, "logps/chosen": -116.87185668945312, "logps/rejected": -76.7201919555664, "loss": 3.2387, "rewards/accuracies": 1.0, "rewards/chosen": 4.26466703414917, "rewards/margins": 0.6711375713348389, "rewards/rejected": 3.593529462814331, "step": 448 }, { "epoch": 0.1, "learning_rate": 9.989939122777614e-06, "logits/chosen": -1.1152451038360596, "logits/rejected": -0.9502503871917725, "logps/chosen": -144.71339416503906, "logps/rejected": -77.88599395751953, "loss": 0.2772, "rewards/accuracies": 1.0, "rewards/chosen": 6.348057746887207, "rewards/margins": 2.751882314682007, "rewards/rejected": 3.5961754322052, "step": 449 }, { "epoch": 0.1, "learning_rate": 9.98982515809407e-06, "logits/chosen": -1.2258877754211426, "logits/rejected": -1.0319254398345947, "logps/chosen": -98.4363021850586, "logps/rejected": -59.02360534667969, "loss": 0.1428, "rewards/accuracies": 1.0, "rewards/chosen": 6.73772668838501, "rewards/margins": 1.2361574172973633, "rewards/rejected": 5.5015692710876465, "step": 450 }, { "epoch": 0.1, "learning_rate": 9.989710552229992e-06, "logits/chosen": -0.9551085233688354, "logits/rejected": -0.9465419054031372, "logps/chosen": -94.30282592773438, "logps/rejected": -87.04264831542969, "loss": 0.4647, "rewards/accuracies": 1.0, "rewards/chosen": 5.53090238571167, "rewards/margins": 1.3227434158325195, "rewards/rejected": 4.20815896987915, "step": 451 }, { "epoch": 0.1, "learning_rate": 9.9895953052001e-06, "logits/chosen": -1.422048807144165, "logits/rejected": -1.2411460876464844, "logps/chosen": -155.25396728515625, "logps/rejected": -151.68777465820312, "loss": 2.7761, "rewards/accuracies": 0.0, "rewards/chosen": 5.1283159255981445, "rewards/margins": -2.6956191062927246, "rewards/rejected": 7.823935031890869, "step": 452 }, { "epoch": 0.1, "learning_rate": 9.989479417019208e-06, "logits/chosen": -0.9495672583580017, "logits/rejected": -0.8213051557540894, "logps/chosen": -106.60887908935547, "logps/rejected": -123.41937255859375, "loss": 0.3883, "rewards/accuracies": 0.0, "rewards/chosen": 3.973482608795166, "rewards/margins": -0.14695215225219727, "rewards/rejected": 4.120434761047363, "step": 453 }, { "epoch": 0.1, "learning_rate": 9.989362887702203e-06, "logits/chosen": -1.365374207496643, "logits/rejected": -1.2743018865585327, "logps/chosen": -133.30831909179688, "logps/rejected": -87.59266662597656, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": 6.499269008636475, "rewards/margins": 2.446169853210449, "rewards/rejected": 4.053099155426025, "step": 454 }, { "epoch": 0.1, "learning_rate": 9.989245717264063e-06, "logits/chosen": -1.089322566986084, "logits/rejected": -1.089322566986084, "logps/chosen": -102.29151153564453, "logps/rejected": -102.29151153564453, "loss": 0.4362, "rewards/accuracies": 0.0, "rewards/chosen": 5.9672112464904785, "rewards/margins": 0.0, "rewards/rejected": 5.9672112464904785, "step": 455 }, { "epoch": 0.1, "learning_rate": 9.989127905719841e-06, "logits/chosen": -0.782899022102356, "logits/rejected": -0.8113395571708679, "logps/chosen": -83.82159423828125, "logps/rejected": -119.98607635498047, "loss": 0.7067, "rewards/accuracies": 0.0, "rewards/chosen": 5.0890212059021, "rewards/margins": -1.0305109024047852, "rewards/rejected": 6.119532108306885, "step": 456 }, { "epoch": 0.1, "learning_rate": 9.989009453084678e-06, "logits/chosen": -0.718597948551178, "logits/rejected": -0.718597948551178, "logps/chosen": -47.23222351074219, "logps/rejected": -47.23222351074219, "loss": 0.7196, "rewards/accuracies": 0.0, "rewards/chosen": 1.5215110778808594, "rewards/margins": 0.0, "rewards/rejected": 1.5215110778808594, "step": 457 }, { "epoch": 0.1, "learning_rate": 9.988890359373794e-06, "logits/chosen": -0.8401886224746704, "logits/rejected": -0.8351582288742065, "logps/chosen": -48.8079719543457, "logps/rejected": -53.56035614013672, "loss": 0.5124, "rewards/accuracies": 1.0, "rewards/chosen": 3.0136349201202393, "rewards/margins": 0.26696276664733887, "rewards/rejected": 2.7466721534729004, "step": 458 }, { "epoch": 0.1, "learning_rate": 9.988770624602488e-06, "logits/chosen": -1.3572804927825928, "logits/rejected": -1.3078504800796509, "logps/chosen": -132.53317260742188, "logps/rejected": -89.51166534423828, "loss": 0.1233, "rewards/accuracies": 1.0, "rewards/chosen": 6.267996311187744, "rewards/margins": 3.385357618331909, "rewards/rejected": 2.882638692855835, "step": 459 }, { "epoch": 0.1, "learning_rate": 9.988650248786153e-06, "logits/chosen": -1.1078896522521973, "logits/rejected": -1.0895732641220093, "logps/chosen": -144.8638458251953, "logps/rejected": -193.455810546875, "loss": 1.785, "rewards/accuracies": 0.0, "rewards/chosen": 6.137761116027832, "rewards/margins": -0.6972393989562988, "rewards/rejected": 6.835000514984131, "step": 460 }, { "epoch": 0.1, "learning_rate": 9.988529231940252e-06, "logits/chosen": -0.7899584770202637, "logits/rejected": -1.1480876207351685, "logps/chosen": -68.08258056640625, "logps/rejected": -162.76309204101562, "loss": 1.5548, "rewards/accuracies": 0.0, "rewards/chosen": 2.2686004638671875, "rewards/margins": -3.003736972808838, "rewards/rejected": 5.272337436676025, "step": 461 }, { "epoch": 0.1, "learning_rate": 9.988407574080337e-06, "logits/chosen": -1.2267228364944458, "logits/rejected": -1.288136601448059, "logps/chosen": -69.42446899414062, "logps/rejected": -95.56974792480469, "loss": 1.239, "rewards/accuracies": 0.0, "rewards/chosen": 2.206944227218628, "rewards/margins": -2.3297927379608154, "rewards/rejected": 4.536736965179443, "step": 462 }, { "epoch": 0.1, "learning_rate": 9.988285275222041e-06, "logits/chosen": -0.8970432877540588, "logits/rejected": -0.8358952403068542, "logps/chosen": -81.19258880615234, "logps/rejected": -26.8035945892334, "loss": 0.4908, "rewards/accuracies": 1.0, "rewards/chosen": 1.471093773841858, "rewards/margins": 1.0954583883285522, "rewards/rejected": 0.3756353557109833, "step": 463 }, { "epoch": 0.1, "learning_rate": 9.988162335381077e-06, "logits/chosen": -1.5776488780975342, "logits/rejected": -1.627002239227295, "logps/chosen": -128.3482208251953, "logps/rejected": -156.4710693359375, "loss": 0.2456, "rewards/accuracies": 1.0, "rewards/chosen": 8.031691551208496, "rewards/margins": 0.6931052207946777, "rewards/rejected": 7.338586330413818, "step": 464 }, { "epoch": 0.1, "learning_rate": 9.988038754573245e-06, "logits/chosen": -1.3271185159683228, "logits/rejected": -1.291821002960205, "logps/chosen": -33.13319396972656, "logps/rejected": -91.2886962890625, "loss": 0.7483, "rewards/accuracies": 0.0, "rewards/chosen": 1.588016152381897, "rewards/margins": -0.3049968481063843, "rewards/rejected": 1.8930130004882812, "step": 465 }, { "epoch": 0.1, "learning_rate": 9.987914532814425e-06, "logits/chosen": -1.1371921300888062, "logits/rejected": -0.9837134480476379, "logps/chosen": -74.2362060546875, "logps/rejected": -75.42726135253906, "loss": 1.0742, "rewards/accuracies": 1.0, "rewards/chosen": 3.7953217029571533, "rewards/margins": 1.2824897766113281, "rewards/rejected": 2.512831926345825, "step": 466 }, { "epoch": 0.1, "learning_rate": 9.987789670120578e-06, "logits/chosen": -0.9211419820785522, "logits/rejected": -0.8385159969329834, "logps/chosen": -53.93684387207031, "logps/rejected": -29.682479858398438, "loss": 0.2455, "rewards/accuracies": 1.0, "rewards/chosen": 2.500704288482666, "rewards/margins": 0.48158693313598633, "rewards/rejected": 2.0191173553466797, "step": 467 }, { "epoch": 0.1, "learning_rate": 9.987664166507749e-06, "logits/chosen": -0.965005099773407, "logits/rejected": -0.8191119432449341, "logps/chosen": -57.81186294555664, "logps/rejected": -26.395282745361328, "loss": 0.3396, "rewards/accuracies": 1.0, "rewards/chosen": 3.9661052227020264, "rewards/margins": 2.9102725982666016, "rewards/rejected": 1.0558327436447144, "step": 468 }, { "epoch": 0.1, "learning_rate": 9.987538021992063e-06, "logits/chosen": -1.0547014474868774, "logits/rejected": -0.974607527256012, "logps/chosen": -38.04062271118164, "logps/rejected": -37.35325241088867, "loss": 1.0538, "rewards/accuracies": 1.0, "rewards/chosen": 2.032876968383789, "rewards/margins": 0.6433783769607544, "rewards/rejected": 1.3894985914230347, "step": 469 }, { "epoch": 0.1, "learning_rate": 9.987411236589733e-06, "logits/chosen": -1.5974072217941284, "logits/rejected": -1.49229097366333, "logps/chosen": -140.39651489257812, "logps/rejected": -102.14701843261719, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 6.271841526031494, "rewards/margins": 2.8675079345703125, "rewards/rejected": 3.4043335914611816, "step": 470 }, { "epoch": 0.1, "learning_rate": 9.987283810317046e-06, "logits/chosen": -1.2508469820022583, "logits/rejected": -1.2424733638763428, "logps/chosen": -57.7360954284668, "logps/rejected": -78.52904510498047, "loss": 0.9709, "rewards/accuracies": 0.0, "rewards/chosen": 2.1387486457824707, "rewards/margins": -1.727515697479248, "rewards/rejected": 3.8662643432617188, "step": 471 }, { "epoch": 0.1, "learning_rate": 9.987155743190379e-06, "logits/chosen": -1.522838830947876, "logits/rejected": -1.406341552734375, "logps/chosen": -91.21753692626953, "logps/rejected": -44.54949188232422, "loss": 0.355, "rewards/accuracies": 1.0, "rewards/chosen": 6.474263668060303, "rewards/margins": 2.6629927158355713, "rewards/rejected": 3.8112709522247314, "step": 472 }, { "epoch": 0.1, "learning_rate": 9.98702703522619e-06, "logits/chosen": -0.9462299346923828, "logits/rejected": -0.8852963447570801, "logps/chosen": -49.083824157714844, "logps/rejected": -64.33307647705078, "loss": 0.8622, "rewards/accuracies": 1.0, "rewards/chosen": 1.8138427734375, "rewards/margins": 0.26597750186920166, "rewards/rejected": 1.5478652715682983, "step": 473 }, { "epoch": 0.1, "learning_rate": 9.986897686441012e-06, "logits/chosen": -0.9832401275634766, "logits/rejected": -0.9480679035186768, "logps/chosen": -34.974517822265625, "logps/rejected": -64.39958953857422, "loss": 2.2815, "rewards/accuracies": 1.0, "rewards/chosen": 2.999924421310425, "rewards/margins": 1.8940008878707886, "rewards/rejected": 1.1059235334396362, "step": 474 }, { "epoch": 0.11, "learning_rate": 9.986767696851472e-06, "logits/chosen": -0.9950131773948669, "logits/rejected": -1.0022931098937988, "logps/chosen": -58.10491943359375, "logps/rejected": -44.52827072143555, "loss": 1.093, "rewards/accuracies": 0.0, "rewards/chosen": 2.0723159313201904, "rewards/margins": -1.1558308601379395, "rewards/rejected": 3.22814679145813, "step": 475 }, { "epoch": 0.11, "learning_rate": 9.98663706647427e-06, "logits/chosen": -1.2882486581802368, "logits/rejected": -1.177573800086975, "logps/chosen": -123.90797424316406, "logps/rejected": -49.5938720703125, "loss": 0.893, "rewards/accuracies": 1.0, "rewards/chosen": 5.764289855957031, "rewards/margins": 3.511953592300415, "rewards/rejected": 2.252336263656616, "step": 476 }, { "epoch": 0.11, "learning_rate": 9.986505795326194e-06, "logits/chosen": -1.2177449464797974, "logits/rejected": -1.2075796127319336, "logps/chosen": -169.21841430664062, "logps/rejected": -181.84011840820312, "loss": 1.5507, "rewards/accuracies": 0.0, "rewards/chosen": 6.437524318695068, "rewards/margins": -1.9834809303283691, "rewards/rejected": 8.421005249023438, "step": 477 }, { "epoch": 0.11, "learning_rate": 9.986373883424108e-06, "logits/chosen": -1.0639935731887817, "logits/rejected": -0.9350028038024902, "logps/chosen": -34.17768478393555, "logps/rejected": -7.76352596282959, "loss": 0.4135, "rewards/accuracies": 1.0, "rewards/chosen": 2.119915008544922, "rewards/margins": 1.4594078063964844, "rewards/rejected": 0.6605071425437927, "step": 478 }, { "epoch": 0.11, "learning_rate": 9.986241330784967e-06, "logits/chosen": -1.1535379886627197, "logits/rejected": -1.0997154712677002, "logps/chosen": -64.76923370361328, "logps/rejected": -82.88432312011719, "loss": 0.6653, "rewards/accuracies": 0.0, "rewards/chosen": 1.8400276899337769, "rewards/margins": -0.9118384122848511, "rewards/rejected": 2.751866102218628, "step": 479 }, { "epoch": 0.11, "learning_rate": 9.9861081374258e-06, "logits/chosen": -1.1010668277740479, "logits/rejected": -1.1010668277740479, "logps/chosen": -140.7203369140625, "logps/rejected": -140.7203369140625, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 5.636435031890869, "rewards/margins": 0.0, "rewards/rejected": 5.636435031890869, "step": 480 }, { "epoch": 0.11, "learning_rate": 9.985974303363723e-06, "logits/chosen": -0.8680778741836548, "logits/rejected": -0.37997880578041077, "logps/chosen": -56.87001037597656, "logps/rejected": -84.40143585205078, "loss": 0.8049, "rewards/accuracies": 0.0, "rewards/chosen": 1.8626854419708252, "rewards/margins": -1.3618712425231934, "rewards/rejected": 3.2245566844940186, "step": 481 }, { "epoch": 0.11, "learning_rate": 9.985839828615937e-06, "logits/chosen": -1.005347490310669, "logits/rejected": -1.0475417375564575, "logps/chosen": -103.4289779663086, "logps/rejected": -55.994991302490234, "loss": 1.318, "rewards/accuracies": 0.0, "rewards/chosen": 2.703695058822632, "rewards/margins": -2.554860830307007, "rewards/rejected": 5.258555889129639, "step": 482 }, { "epoch": 0.11, "learning_rate": 9.985704713199715e-06, "logits/chosen": -1.0341236591339111, "logits/rejected": -0.9682067036628723, "logps/chosen": -65.67146301269531, "logps/rejected": -64.66891479492188, "loss": 0.9175, "rewards/accuracies": 1.0, "rewards/chosen": 2.204538106918335, "rewards/margins": 0.6568986177444458, "rewards/rejected": 1.5476394891738892, "step": 483 }, { "epoch": 0.11, "learning_rate": 9.985568957132425e-06, "logits/chosen": -1.003737211227417, "logits/rejected": -0.9757958650588989, "logps/chosen": -50.86979675292969, "logps/rejected": -70.00514221191406, "loss": 0.65, "rewards/accuracies": 0.0, "rewards/chosen": 1.3963119983673096, "rewards/margins": -0.09338295459747314, "rewards/rejected": 1.4896949529647827, "step": 484 }, { "epoch": 0.11, "learning_rate": 9.98543256043151e-06, "logits/chosen": -1.0052800178527832, "logits/rejected": -1.1619385480880737, "logps/chosen": -39.55171203613281, "logps/rejected": -43.78768539428711, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": 3.114521026611328, "rewards/margins": 1.1101016998291016, "rewards/rejected": 2.0044193267822266, "step": 485 }, { "epoch": 0.11, "learning_rate": 9.985295523114492e-06, "logits/chosen": -1.067940354347229, "logits/rejected": -0.9159751534461975, "logps/chosen": -150.16827392578125, "logps/rejected": -37.37651443481445, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 5.210475444793701, "rewards/margins": 4.213963031768799, "rewards/rejected": 0.9965125918388367, "step": 486 }, { "epoch": 0.11, "learning_rate": 9.985157845198987e-06, "logits/chosen": -0.7928709387779236, "logits/rejected": -0.44388753175735474, "logps/chosen": -65.82759857177734, "logps/rejected": -51.90214157104492, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 2.5929818153381348, "rewards/margins": 2.7159643173217773, "rewards/rejected": -0.12298240512609482, "step": 487 }, { "epoch": 0.11, "learning_rate": 9.985019526702682e-06, "logits/chosen": -1.1586858034133911, "logits/rejected": -1.0033314228057861, "logps/chosen": -183.94442749023438, "logps/rejected": -115.67064666748047, "loss": 0.3591, "rewards/accuracies": 1.0, "rewards/chosen": 6.57208251953125, "rewards/margins": 3.3560492992401123, "rewards/rejected": 3.2160332202911377, "step": 488 }, { "epoch": 0.11, "learning_rate": 9.984880567643351e-06, "logits/chosen": -1.2616517543792725, "logits/rejected": -1.2499488592147827, "logps/chosen": -79.36278533935547, "logps/rejected": -59.60911560058594, "loss": 0.3834, "rewards/accuracies": 1.0, "rewards/chosen": 4.925632476806641, "rewards/margins": 0.6078939437866211, "rewards/rejected": 4.3177385330200195, "step": 489 }, { "epoch": 0.11, "learning_rate": 9.984740968038852e-06, "logits/chosen": -0.935750424861908, "logits/rejected": -0.9516734480857849, "logps/chosen": -89.23179626464844, "logps/rejected": -98.85867309570312, "loss": 1.3426, "rewards/accuracies": 0.0, "rewards/chosen": 4.650122165679932, "rewards/margins": -2.144131660461426, "rewards/rejected": 6.794253826141357, "step": 490 }, { "epoch": 0.11, "learning_rate": 9.984600727907119e-06, "logits/chosen": -0.8334212303161621, "logits/rejected": -0.8381516933441162, "logps/chosen": -9.8496675491333, "logps/rejected": -23.36496353149414, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 0.8591143488883972, "rewards/margins": 0.400934100151062, "rewards/rejected": 0.4581802487373352, "step": 491 }, { "epoch": 0.11, "learning_rate": 9.984459847266176e-06, "logits/chosen": -0.6573788523674011, "logits/rejected": -0.534971296787262, "logps/chosen": -87.63063049316406, "logps/rejected": -42.489051818847656, "loss": 1.0706, "rewards/accuracies": 1.0, "rewards/chosen": 3.839512586593628, "rewards/margins": 1.887905478477478, "rewards/rejected": 1.95160710811615, "step": 492 }, { "epoch": 0.11, "learning_rate": 9.984318326134125e-06, "logits/chosen": -0.9037403464317322, "logits/rejected": -0.8608068823814392, "logps/chosen": -32.57612609863281, "logps/rejected": -71.30891418457031, "loss": 0.7786, "rewards/accuracies": 1.0, "rewards/chosen": 2.3166725635528564, "rewards/margins": 0.48950040340423584, "rewards/rejected": 1.8271721601486206, "step": 493 }, { "epoch": 0.11, "learning_rate": 9.984176164529151e-06, "logits/chosen": -1.4799163341522217, "logits/rejected": -1.4713048934936523, "logps/chosen": -124.03541564941406, "logps/rejected": -100.93470764160156, "loss": 0.588, "rewards/accuracies": 0.0, "rewards/chosen": 4.043800354003906, "rewards/margins": -0.7434248924255371, "rewards/rejected": 4.787225246429443, "step": 494 }, { "epoch": 0.11, "learning_rate": 9.984033362469522e-06, "logits/chosen": -0.9513623118400574, "logits/rejected": -0.8274616599082947, "logps/chosen": -105.27317810058594, "logps/rejected": -63.67306137084961, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 2.7497849464416504, "rewards/margins": 0.05817842483520508, "rewards/rejected": 2.6916065216064453, "step": 495 }, { "epoch": 0.11, "learning_rate": 9.983889919973586e-06, "logits/chosen": -1.123485803604126, "logits/rejected": -0.9614908695220947, "logps/chosen": -100.99211120605469, "logps/rejected": -63.291419982910156, "loss": 0.1038, "rewards/accuracies": 1.0, "rewards/chosen": 4.811952114105225, "rewards/margins": 1.599928855895996, "rewards/rejected": 3.2120232582092285, "step": 496 }, { "epoch": 0.11, "learning_rate": 9.983745837059777e-06, "logits/chosen": -0.8141000270843506, "logits/rejected": -0.6775975227355957, "logps/chosen": -120.30287170410156, "logps/rejected": -57.80274963378906, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 5.330038547515869, "rewards/margins": 3.1759049892425537, "rewards/rejected": 2.1541335582733154, "step": 497 }, { "epoch": 0.11, "learning_rate": 9.98360111374661e-06, "logits/chosen": -1.0651856660842896, "logits/rejected": -1.1350045204162598, "logps/chosen": -73.65530395507812, "logps/rejected": -111.94143676757812, "loss": 1.1372, "rewards/accuracies": 0.0, "rewards/chosen": 4.4535136222839355, "rewards/margins": -2.151320457458496, "rewards/rejected": 6.604834079742432, "step": 498 }, { "epoch": 0.11, "learning_rate": 9.983455750052678e-06, "logits/chosen": -0.9430989623069763, "logits/rejected": -0.8242587447166443, "logps/chosen": -45.893043518066406, "logps/rejected": -70.92997741699219, "loss": 1.1232, "rewards/accuracies": 1.0, "rewards/chosen": 3.718045949935913, "rewards/margins": 1.2331788539886475, "rewards/rejected": 2.4848670959472656, "step": 499 }, { "epoch": 0.11, "learning_rate": 9.983309745996663e-06, "logits/chosen": -1.0455940961837769, "logits/rejected": -1.0241358280181885, "logps/chosen": -54.98444366455078, "logps/rejected": -46.17474365234375, "loss": 2.651, "rewards/accuracies": 1.0, "rewards/chosen": 2.2770402431488037, "rewards/margins": 0.34341979026794434, "rewards/rejected": 1.9336204528808594, "step": 500 }, { "epoch": 0.11, "learning_rate": 9.983163101597325e-06, "logits/chosen": -1.0052714347839355, "logits/rejected": -0.8407289385795593, "logps/chosen": -154.9622039794922, "logps/rejected": -25.644851684570312, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": 3.519557237625122, "rewards/margins": 1.0755562782287598, "rewards/rejected": 2.4440009593963623, "step": 501 }, { "epoch": 0.11, "learning_rate": 9.983015816873508e-06, "logits/chosen": -1.2946116924285889, "logits/rejected": -0.9478684663772583, "logps/chosen": -200.89002990722656, "logps/rejected": -73.83406066894531, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 6.946778774261475, "rewards/margins": 5.875274658203125, "rewards/rejected": 1.0715042352676392, "step": 502 }, { "epoch": 0.11, "learning_rate": 9.982867891844136e-06, "logits/chosen": -0.8154950737953186, "logits/rejected": -0.7546623349189758, "logps/chosen": -61.5232048034668, "logps/rejected": -61.589874267578125, "loss": 0.2012, "rewards/accuracies": 1.0, "rewards/chosen": 2.1695058345794678, "rewards/margins": 0.764732837677002, "rewards/rejected": 1.4047729969024658, "step": 503 }, { "epoch": 0.11, "learning_rate": 9.98271932652822e-06, "logits/chosen": -1.1802014112472534, "logits/rejected": -1.1546598672866821, "logps/chosen": -48.560951232910156, "logps/rejected": -105.31280517578125, "loss": 0.8718, "rewards/accuracies": 0.0, "rewards/chosen": 1.8460968732833862, "rewards/margins": -0.8285888433456421, "rewards/rejected": 2.6746857166290283, "step": 504 }, { "epoch": 0.11, "learning_rate": 9.982570120944847e-06, "logits/chosen": -0.9443962574005127, "logits/rejected": -0.9543178677558899, "logps/chosen": -71.89118957519531, "logps/rejected": -55.40229797363281, "loss": 3.0066, "rewards/accuracies": 1.0, "rewards/chosen": 2.0248069763183594, "rewards/margins": 0.135101318359375, "rewards/rejected": 1.8897056579589844, "step": 505 }, { "epoch": 0.11, "learning_rate": 9.982420275113194e-06, "logits/chosen": -1.0424197912216187, "logits/rejected": -1.1749591827392578, "logps/chosen": -79.48885345458984, "logps/rejected": -113.88197326660156, "loss": 1.4955, "rewards/accuracies": 0.0, "rewards/chosen": 3.8561713695526123, "rewards/margins": -2.7834742069244385, "rewards/rejected": 6.639645576477051, "step": 506 }, { "epoch": 0.11, "learning_rate": 9.98226978905251e-06, "logits/chosen": -1.2374787330627441, "logits/rejected": -1.082432508468628, "logps/chosen": -137.0396270751953, "logps/rejected": -71.69029235839844, "loss": 0.2699, "rewards/accuracies": 1.0, "rewards/chosen": 4.621028423309326, "rewards/margins": 0.3418145179748535, "rewards/rejected": 4.279213905334473, "step": 507 }, { "epoch": 0.11, "learning_rate": 9.982118662782136e-06, "logits/chosen": -0.9253109097480774, "logits/rejected": -0.9253109097480774, "logps/chosen": -55.819679260253906, "logps/rejected": -55.819679260253906, "loss": 0.3649, "rewards/accuracies": 0.0, "rewards/chosen": 2.3378396034240723, "rewards/margins": 0.0, "rewards/rejected": 2.3378396034240723, "step": 508 }, { "epoch": 0.11, "learning_rate": 9.981966896321492e-06, "logits/chosen": -0.9067976474761963, "logits/rejected": -0.8419598937034607, "logps/chosen": -54.376441955566406, "logps/rejected": -92.74639129638672, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": 5.009521007537842, "rewards/margins": 1.1038286685943604, "rewards/rejected": 3.9056923389434814, "step": 509 }, { "epoch": 0.11, "learning_rate": 9.981814489690077e-06, "logits/chosen": -1.217690110206604, "logits/rejected": -0.7295594811439514, "logps/chosen": -174.55294799804688, "logps/rejected": -26.217315673828125, "loss": 0.289, "rewards/accuracies": 1.0, "rewards/chosen": 4.100757122039795, "rewards/margins": 3.5354366302490234, "rewards/rejected": 0.5653204321861267, "step": 510 }, { "epoch": 0.11, "learning_rate": 9.981661442907477e-06, "logits/chosen": -0.9384743571281433, "logits/rejected": -0.897858738899231, "logps/chosen": -59.36105728149414, "logps/rejected": -37.70536422729492, "loss": 0.2427, "rewards/accuracies": 1.0, "rewards/chosen": 2.426701068878174, "rewards/margins": 0.48569726943969727, "rewards/rejected": 1.9410037994384766, "step": 511 }, { "epoch": 0.11, "learning_rate": 9.981507755993357e-06, "logits/chosen": -0.9319263696670532, "logits/rejected": -0.9440282583236694, "logps/chosen": -66.17803955078125, "logps/rejected": -76.7359390258789, "loss": 0.4132, "rewards/accuracies": 1.0, "rewards/chosen": 4.062819004058838, "rewards/margins": 2.2347803115844727, "rewards/rejected": 1.8280388116836548, "step": 512 }, { "epoch": 0.11, "learning_rate": 9.981353428967465e-06, "logits/chosen": -0.9979622960090637, "logits/rejected": -0.9636707901954651, "logps/chosen": -123.59066009521484, "logps/rejected": -146.32040405273438, "loss": 1.448, "rewards/accuracies": 0.0, "rewards/chosen": 5.302516937255859, "rewards/margins": -2.4758386611938477, "rewards/rejected": 7.778355598449707, "step": 513 }, { "epoch": 0.11, "learning_rate": 9.98119846184963e-06, "logits/chosen": -0.8891828656196594, "logits/rejected": -1.0188782215118408, "logps/chosen": -87.75247192382812, "logps/rejected": -103.18305969238281, "loss": 2.9901, "rewards/accuracies": 0.0, "rewards/chosen": 2.967031955718994, "rewards/margins": -3.8342790603637695, "rewards/rejected": 6.801311016082764, "step": 514 }, { "epoch": 0.11, "learning_rate": 9.98104285465977e-06, "logits/chosen": -1.0996203422546387, "logits/rejected": -1.1210126876831055, "logps/chosen": -81.72764587402344, "logps/rejected": -107.57350158691406, "loss": 0.281, "rewards/accuracies": 1.0, "rewards/chosen": 6.232041835784912, "rewards/margins": 0.4629530906677246, "rewards/rejected": 5.7690887451171875, "step": 515 }, { "epoch": 0.11, "learning_rate": 9.980886607417877e-06, "logits/chosen": -0.7167373895645142, "logits/rejected": -0.7167373895645142, "logps/chosen": -75.43202209472656, "logps/rejected": -75.43202209472656, "loss": 0.3639, "rewards/accuracies": 0.0, "rewards/chosen": 1.6779876947402954, "rewards/margins": 0.0, "rewards/rejected": 1.6779876947402954, "step": 516 }, { "epoch": 0.11, "learning_rate": 9.980729720144027e-06, "logits/chosen": -1.2719923257827759, "logits/rejected": -1.1837114095687866, "logps/chosen": -140.42755126953125, "logps/rejected": -45.949703216552734, "loss": 0.5143, "rewards/accuracies": 1.0, "rewards/chosen": 5.8651018142700195, "rewards/margins": 2.8695309162139893, "rewards/rejected": 2.9955708980560303, "step": 517 }, { "epoch": 0.11, "learning_rate": 9.980572192858383e-06, "logits/chosen": -0.7863404750823975, "logits/rejected": -0.7928597927093506, "logps/chosen": -96.71295928955078, "logps/rejected": -91.79585266113281, "loss": 2.2407, "rewards/accuracies": 1.0, "rewards/chosen": 4.5099663734436035, "rewards/margins": 1.7596588134765625, "rewards/rejected": 2.750307559967041, "step": 518 }, { "epoch": 0.11, "learning_rate": 9.980414025581185e-06, "logits/chosen": -0.9898003339767456, "logits/rejected": -0.953047513961792, "logps/chosen": -47.59535217285156, "logps/rejected": -49.31460952758789, "loss": 0.5654, "rewards/accuracies": 1.0, "rewards/chosen": 2.3635613918304443, "rewards/margins": 0.6201932430267334, "rewards/rejected": 1.743368148803711, "step": 519 }, { "epoch": 0.12, "learning_rate": 9.980255218332758e-06, "logits/chosen": -1.1257224082946777, "logits/rejected": -1.1257224082946777, "logps/chosen": -111.18411254882812, "logps/rejected": -111.18411254882812, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 5.137475490570068, "rewards/margins": 0.0, "rewards/rejected": 5.137475490570068, "step": 520 }, { "epoch": 0.12, "learning_rate": 9.980095771133504e-06, "logits/chosen": -1.0827797651290894, "logits/rejected": -1.093344807624817, "logps/chosen": -56.076210021972656, "logps/rejected": -53.13554763793945, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 2.4412765502929688, "rewards/margins": -1.0693128108978271, "rewards/rejected": 3.510589361190796, "step": 521 }, { "epoch": 0.12, "learning_rate": 9.979935684003918e-06, "logits/chosen": -0.8794252872467041, "logits/rejected": -0.8486648797988892, "logps/chosen": -68.73431396484375, "logps/rejected": -103.39996337890625, "loss": 0.848, "rewards/accuracies": 0.0, "rewards/chosen": 3.1597015857696533, "rewards/margins": -0.8054261207580566, "rewards/rejected": 3.96512770652771, "step": 522 }, { "epoch": 0.12, "learning_rate": 9.979774956964569e-06, "logits/chosen": -0.9271775484085083, "logits/rejected": -0.8402308225631714, "logps/chosen": -107.09843444824219, "logps/rejected": -75.51335906982422, "loss": 0.3444, "rewards/accuracies": 1.0, "rewards/chosen": 4.564919948577881, "rewards/margins": 2.054048776626587, "rewards/rejected": 2.510871171951294, "step": 523 }, { "epoch": 0.12, "learning_rate": 9.979613590036108e-06, "logits/chosen": -1.0358983278274536, "logits/rejected": -1.0358983278274536, "logps/chosen": -169.9907989501953, "logps/rejected": -169.9907989501953, "loss": 0.3484, "rewards/accuracies": 0.0, "rewards/chosen": 6.0798020362854, "rewards/margins": 0.0, "rewards/rejected": 6.0798020362854, "step": 524 }, { "epoch": 0.12, "learning_rate": 9.979451583239272e-06, "logits/chosen": -1.101149082183838, "logits/rejected": -0.9619909524917603, "logps/chosen": -92.73119354248047, "logps/rejected": -67.92132568359375, "loss": 0.0823, "rewards/accuracies": 1.0, "rewards/chosen": 6.153165340423584, "rewards/margins": 2.8452017307281494, "rewards/rejected": 3.3079636096954346, "step": 525 }, { "epoch": 0.12, "learning_rate": 9.979288936594877e-06, "logits/chosen": -0.6553951501846313, "logits/rejected": -0.6683906316757202, "logps/chosen": -45.08233642578125, "logps/rejected": -67.98410034179688, "loss": 1.8856, "rewards/accuracies": 0.0, "rewards/chosen": 1.8781646490097046, "rewards/margins": -1.2783998250961304, "rewards/rejected": 3.156564474105835, "step": 526 }, { "epoch": 0.12, "learning_rate": 9.979125650123824e-06, "logits/chosen": -0.9177359342575073, "logits/rejected": -1.0205131769180298, "logps/chosen": -44.428489685058594, "logps/rejected": -110.57487487792969, "loss": 0.6764, "rewards/accuracies": 0.0, "rewards/chosen": 2.1700119972229004, "rewards/margins": -0.28714585304260254, "rewards/rejected": 2.457157850265503, "step": 527 }, { "epoch": 0.12, "learning_rate": 9.978961723847093e-06, "logits/chosen": -0.9349456429481506, "logits/rejected": -0.9628576040267944, "logps/chosen": -62.32330322265625, "logps/rejected": -95.72000885009766, "loss": 0.8783, "rewards/accuracies": 0.0, "rewards/chosen": 4.740292549133301, "rewards/margins": -1.061861515045166, "rewards/rejected": 5.802154064178467, "step": 528 }, { "epoch": 0.12, "learning_rate": 9.978797157785752e-06, "logits/chosen": -1.1399093866348267, "logits/rejected": -1.1035772562026978, "logps/chosen": -44.391666412353516, "logps/rejected": -95.99197387695312, "loss": 0.4919, "rewards/accuracies": 0.0, "rewards/chosen": 2.794010639190674, "rewards/margins": -0.05705833435058594, "rewards/rejected": 2.8510689735412598, "step": 529 }, { "epoch": 0.12, "learning_rate": 9.978631951960942e-06, "logits/chosen": -0.7126337885856628, "logits/rejected": -0.6936812400817871, "logps/chosen": -42.178680419921875, "logps/rejected": -58.573001861572266, "loss": 0.3997, "rewards/accuracies": 0.0, "rewards/chosen": 2.0648884773254395, "rewards/margins": -0.15594935417175293, "rewards/rejected": 2.2208378314971924, "step": 530 }, { "epoch": 0.12, "learning_rate": 9.978466106393896e-06, "logits/chosen": -0.9798562526702881, "logits/rejected": -0.8937344551086426, "logps/chosen": -95.06817626953125, "logps/rejected": -50.0731201171875, "loss": 1.4289, "rewards/accuracies": 0.0, "rewards/chosen": 2.2637810707092285, "rewards/margins": -0.7855224609375, "rewards/rejected": 3.0493035316467285, "step": 531 }, { "epoch": 0.12, "learning_rate": 9.978299621105924e-06, "logits/chosen": -1.230849266052246, "logits/rejected": -1.0189759731292725, "logps/chosen": -136.11550903320312, "logps/rejected": -45.37645721435547, "loss": 0.5317, "rewards/accuracies": 1.0, "rewards/chosen": 6.042944431304932, "rewards/margins": 4.304898262023926, "rewards/rejected": 1.7380459308624268, "step": 532 }, { "epoch": 0.12, "learning_rate": 9.978132496118418e-06, "logits/chosen": -0.8406463861465454, "logits/rejected": -0.7337409257888794, "logps/chosen": -111.26182556152344, "logps/rejected": -103.7058334350586, "loss": 0.7487, "rewards/accuracies": 0.0, "rewards/chosen": 3.6108016967773438, "rewards/margins": -0.22208476066589355, "rewards/rejected": 3.8328864574432373, "step": 533 }, { "epoch": 0.12, "learning_rate": 9.977964731452852e-06, "logits/chosen": -1.2691017389297485, "logits/rejected": -1.2691017389297485, "logps/chosen": -83.92389678955078, "logps/rejected": -83.92389678955078, "loss": 0.7251, "rewards/accuracies": 0.0, "rewards/chosen": 3.6737143993377686, "rewards/margins": 0.0, "rewards/rejected": 3.6737143993377686, "step": 534 }, { "epoch": 0.12, "learning_rate": 9.977796327130786e-06, "logits/chosen": -1.177045226097107, "logits/rejected": -1.1122441291809082, "logps/chosen": -78.35442352294922, "logps/rejected": -73.92045593261719, "loss": 0.4055, "rewards/accuracies": 0.0, "rewards/chosen": 3.0280396938323975, "rewards/margins": -0.00754547119140625, "rewards/rejected": 3.0355851650238037, "step": 535 }, { "epoch": 0.12, "learning_rate": 9.977627283173858e-06, "logits/chosen": -0.926159679889679, "logits/rejected": -0.8092381954193115, "logps/chosen": -80.30366516113281, "logps/rejected": -87.08832550048828, "loss": 0.7517, "rewards/accuracies": 1.0, "rewards/chosen": 4.447749614715576, "rewards/margins": 1.568100929260254, "rewards/rejected": 2.8796486854553223, "step": 536 }, { "epoch": 0.12, "learning_rate": 9.97745759960379e-06, "logits/chosen": -1.3544992208480835, "logits/rejected": -1.015575885772705, "logps/chosen": -105.70531463623047, "logps/rejected": -48.60778045654297, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": 5.651072025299072, "rewards/margins": 3.4765548706054688, "rewards/rejected": 2.1745171546936035, "step": 537 }, { "epoch": 0.12, "learning_rate": 9.977287276442385e-06, "logits/chosen": -0.8725242018699646, "logits/rejected": -0.9663902521133423, "logps/chosen": -59.735023498535156, "logps/rejected": -46.856666564941406, "loss": 1.3571, "rewards/accuracies": 0.0, "rewards/chosen": 3.0165505409240723, "rewards/margins": -2.244210720062256, "rewards/rejected": 5.260761260986328, "step": 538 }, { "epoch": 0.12, "learning_rate": 9.97711631371153e-06, "logits/chosen": -1.0998804569244385, "logits/rejected": -1.0975759029388428, "logps/chosen": -81.65946960449219, "logps/rejected": -65.2856216430664, "loss": 0.5508, "rewards/accuracies": 1.0, "rewards/chosen": 3.0216453075408936, "rewards/margins": 0.6578505039215088, "rewards/rejected": 2.3637948036193848, "step": 539 }, { "epoch": 0.12, "learning_rate": 9.976944711433194e-06, "logits/chosen": -0.8811697959899902, "logits/rejected": -0.8811697959899902, "logps/chosen": -48.381412506103516, "logps/rejected": -48.381412506103516, "loss": 0.4256, "rewards/accuracies": 0.0, "rewards/chosen": 1.664493203163147, "rewards/margins": 0.0, "rewards/rejected": 1.664493203163147, "step": 540 }, { "epoch": 0.12, "learning_rate": 9.976772469629428e-06, "logits/chosen": -1.2003891468048096, "logits/rejected": -1.0874184370040894, "logps/chosen": -84.06698608398438, "logps/rejected": -59.993896484375, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 7.161181926727295, "rewards/margins": 4.089765548706055, "rewards/rejected": 3.0714166164398193, "step": 541 }, { "epoch": 0.12, "learning_rate": 9.976599588322362e-06, "logits/chosen": -1.1521750688552856, "logits/rejected": -1.2025275230407715, "logps/chosen": -63.38039016723633, "logps/rejected": -98.08938598632812, "loss": 1.6338, "rewards/accuracies": 0.0, "rewards/chosen": 2.11755108833313, "rewards/margins": -3.209240198135376, "rewards/rejected": 5.326791286468506, "step": 542 }, { "epoch": 0.12, "learning_rate": 9.976426067534212e-06, "logits/chosen": -0.9110174775123596, "logits/rejected": -0.97939532995224, "logps/chosen": -50.90020751953125, "logps/rejected": -50.589561462402344, "loss": 1.2455, "rewards/accuracies": 0.0, "rewards/chosen": 1.5737335681915283, "rewards/margins": -1.8232483863830566, "rewards/rejected": 3.396981954574585, "step": 543 }, { "epoch": 0.12, "learning_rate": 9.976251907287277e-06, "logits/chosen": -1.361299991607666, "logits/rejected": -1.186069130897522, "logps/chosen": -211.68060302734375, "logps/rejected": -55.49190902709961, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 5.996036052703857, "rewards/margins": 1.3400211334228516, "rewards/rejected": 4.656014919281006, "step": 544 }, { "epoch": 0.12, "learning_rate": 9.976077107603933e-06, "logits/chosen": -1.2968957424163818, "logits/rejected": -1.3047676086425781, "logps/chosen": -160.84619140625, "logps/rejected": -86.76174926757812, "loss": 1.257, "rewards/accuracies": 0.0, "rewards/chosen": 5.333059787750244, "rewards/margins": -2.376565456390381, "rewards/rejected": 7.709625244140625, "step": 545 }, { "epoch": 0.12, "learning_rate": 9.975901668506644e-06, "logits/chosen": -0.8735765814781189, "logits/rejected": -0.8735765814781189, "logps/chosen": -77.30643463134766, "logps/rejected": -77.30643463134766, "loss": 0.365, "rewards/accuracies": 0.0, "rewards/chosen": 1.3689559698104858, "rewards/margins": 0.0, "rewards/rejected": 1.3689559698104858, "step": 546 }, { "epoch": 0.12, "learning_rate": 9.97572559001795e-06, "logits/chosen": -1.352431297302246, "logits/rejected": -1.1956313848495483, "logps/chosen": -67.85432434082031, "logps/rejected": -48.49799346923828, "loss": 0.2378, "rewards/accuracies": 1.0, "rewards/chosen": 2.6918365955352783, "rewards/margins": 1.0050941705703735, "rewards/rejected": 1.6867424249649048, "step": 547 }, { "epoch": 0.12, "learning_rate": 9.975548872160482e-06, "logits/chosen": -1.0832239389419556, "logits/rejected": -0.8718990683555603, "logps/chosen": -135.56130981445312, "logps/rejected": -66.33885192871094, "loss": 1.3872, "rewards/accuracies": 1.0, "rewards/chosen": 6.0867462158203125, "rewards/margins": 0.3950462341308594, "rewards/rejected": 5.691699981689453, "step": 548 }, { "epoch": 0.12, "learning_rate": 9.975371514956945e-06, "logits/chosen": -1.3051340579986572, "logits/rejected": -1.1359918117523193, "logps/chosen": -62.03540802001953, "logps/rejected": -13.4114408493042, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 1.5607147216796875, "rewards/margins": 0.7183558344841003, "rewards/rejected": 0.8423588871955872, "step": 549 }, { "epoch": 0.12, "learning_rate": 9.975193518430127e-06, "logits/chosen": -1.1372339725494385, "logits/rejected": -1.138287901878357, "logps/chosen": -51.267974853515625, "logps/rejected": -86.98597717285156, "loss": 1.3859, "rewards/accuracies": 0.0, "rewards/chosen": 0.8951042294502258, "rewards/margins": -2.300809621810913, "rewards/rejected": 3.195913791656494, "step": 550 }, { "epoch": 0.12, "learning_rate": 9.9750148826029e-06, "logits/chosen": -0.7147239446640015, "logits/rejected": -0.7147239446640015, "logps/chosen": -59.419883728027344, "logps/rejected": -59.419883728027344, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 2.2764549255371094, "rewards/margins": 0.0, "rewards/rejected": 2.2764549255371094, "step": 551 }, { "epoch": 0.12, "learning_rate": 9.974835607498224e-06, "logits/chosen": -1.0994956493377686, "logits/rejected": -0.8928612470626831, "logps/chosen": -163.51373291015625, "logps/rejected": -30.308853149414062, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 3.6705689430236816, "rewards/margins": 2.934385061264038, "rewards/rejected": 0.7361839413642883, "step": 552 }, { "epoch": 0.12, "learning_rate": 9.97465569313913e-06, "logits/chosen": -0.8719661831855774, "logits/rejected": -0.8181272149085999, "logps/chosen": -63.39671325683594, "logps/rejected": -53.959224700927734, "loss": 2.3656, "rewards/accuracies": 1.0, "rewards/chosen": 3.3893425464630127, "rewards/margins": 1.515968680381775, "rewards/rejected": 1.8733738660812378, "step": 553 }, { "epoch": 0.12, "learning_rate": 9.974475139548738e-06, "logits/chosen": -0.6033105850219727, "logits/rejected": -0.5644059181213379, "logps/chosen": -89.78533935546875, "logps/rejected": -59.48767852783203, "loss": 2.3459, "rewards/accuracies": 0.0, "rewards/chosen": 2.594538927078247, "rewards/margins": -0.8078911304473877, "rewards/rejected": 3.4024300575256348, "step": 554 }, { "epoch": 0.12, "learning_rate": 9.97429394675025e-06, "logits/chosen": -1.0783964395523071, "logits/rejected": -1.2113440036773682, "logps/chosen": -84.85879516601562, "logps/rejected": -99.87403869628906, "loss": 3.7669, "rewards/accuracies": 0.0, "rewards/chosen": 1.638763427734375, "rewards/margins": -6.781015396118164, "rewards/rejected": 8.419778823852539, "step": 555 }, { "epoch": 0.12, "learning_rate": 9.974112114766945e-06, "logits/chosen": -0.9370518326759338, "logits/rejected": -0.9644907712936401, "logps/chosen": -38.69902801513672, "logps/rejected": -72.03638458251953, "loss": 1.0701, "rewards/accuracies": 0.0, "rewards/chosen": 1.892808198928833, "rewards/margins": -1.9105823040008545, "rewards/rejected": 3.8033905029296875, "step": 556 }, { "epoch": 0.12, "learning_rate": 9.973929643622194e-06, "logits/chosen": -1.4066860675811768, "logits/rejected": -1.3928837776184082, "logps/chosen": -50.370948791503906, "logps/rejected": -77.36813354492188, "loss": 0.162, "rewards/accuracies": 1.0, "rewards/chosen": 3.0652146339416504, "rewards/margins": 0.9779617786407471, "rewards/rejected": 2.0872528553009033, "step": 557 }, { "epoch": 0.12, "learning_rate": 9.973746533339438e-06, "logits/chosen": -1.02852463722229, "logits/rejected": -1.02852463722229, "logps/chosen": -155.28443908691406, "logps/rejected": -155.28443908691406, "loss": 0.3751, "rewards/accuracies": 0.0, "rewards/chosen": 4.571904182434082, "rewards/margins": 0.0, "rewards/rejected": 4.571904182434082, "step": 558 }, { "epoch": 0.12, "learning_rate": 9.97356278394221e-06, "logits/chosen": -1.1313183307647705, "logits/rejected": -1.083748459815979, "logps/chosen": -38.941856384277344, "logps/rejected": -26.128173828125, "loss": 0.2084, "rewards/accuracies": 1.0, "rewards/chosen": 2.632610321044922, "rewards/margins": 0.9441550970077515, "rewards/rejected": 1.6884552240371704, "step": 559 }, { "epoch": 0.12, "learning_rate": 9.973378395454121e-06, "logits/chosen": -1.1604244709014893, "logits/rejected": -1.1095731258392334, "logps/chosen": -38.40080642700195, "logps/rejected": -57.51567459106445, "loss": 1.1045, "rewards/accuracies": 0.0, "rewards/chosen": 2.7112972736358643, "rewards/margins": -1.7492883205413818, "rewards/rejected": 4.460585594177246, "step": 560 }, { "epoch": 0.12, "learning_rate": 9.973193367898863e-06, "logits/chosen": -1.242860198020935, "logits/rejected": -1.1772323846817017, "logps/chosen": -80.2684326171875, "logps/rejected": -75.52656555175781, "loss": 0.8208, "rewards/accuracies": 1.0, "rewards/chosen": 3.211650848388672, "rewards/margins": 0.6908957958221436, "rewards/rejected": 2.5207550525665283, "step": 561 }, { "epoch": 0.12, "learning_rate": 9.973007701300214e-06, "logits/chosen": -0.8862510919570923, "logits/rejected": -1.0229942798614502, "logps/chosen": -70.80963897705078, "logps/rejected": -154.92410278320312, "loss": 1.8637, "rewards/accuracies": 0.0, "rewards/chosen": 3.173442840576172, "rewards/margins": -3.121994972229004, "rewards/rejected": 6.295437812805176, "step": 562 }, { "epoch": 0.12, "learning_rate": 9.972821395682029e-06, "logits/chosen": -0.934666097164154, "logits/rejected": -0.8382720947265625, "logps/chosen": -66.81019592285156, "logps/rejected": -50.629425048828125, "loss": 0.5373, "rewards/accuracies": 1.0, "rewards/chosen": 3.002829074859619, "rewards/margins": 0.7669327259063721, "rewards/rejected": 2.235896348953247, "step": 563 }, { "epoch": 0.12, "learning_rate": 9.972634451068248e-06, "logits/chosen": -0.8106500506401062, "logits/rejected": -0.7543385624885559, "logps/chosen": -41.918235778808594, "logps/rejected": -66.31562805175781, "loss": 0.52, "rewards/accuracies": 0.0, "rewards/chosen": 2.4027962684631348, "rewards/margins": -0.5998489856719971, "rewards/rejected": 3.002645254135132, "step": 564 }, { "epoch": 0.13, "learning_rate": 9.972446867482896e-06, "logits/chosen": -1.2737165689468384, "logits/rejected": -1.1706047058105469, "logps/chosen": -90.92842864990234, "logps/rejected": -93.94347381591797, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 5.641332149505615, "rewards/margins": 3.991702079772949, "rewards/rejected": 1.6496299505233765, "step": 565 }, { "epoch": 0.13, "learning_rate": 9.972258644950074e-06, "logits/chosen": -0.8915891647338867, "logits/rejected": -0.8792617917060852, "logps/chosen": -47.24810028076172, "logps/rejected": -101.22190856933594, "loss": 0.359, "rewards/accuracies": 0.0, "rewards/chosen": 2.7858970165252686, "rewards/margins": -0.043872833251953125, "rewards/rejected": 2.8297698497772217, "step": 566 }, { "epoch": 0.13, "learning_rate": 9.97206978349397e-06, "logits/chosen": -1.366844654083252, "logits/rejected": -1.2423683404922485, "logps/chosen": -103.42880249023438, "logps/rejected": -100.0396957397461, "loss": 0.7651, "rewards/accuracies": 1.0, "rewards/chosen": 5.937826633453369, "rewards/margins": 0.6912469863891602, "rewards/rejected": 5.246579647064209, "step": 567 }, { "epoch": 0.13, "learning_rate": 9.971880283138849e-06, "logits/chosen": -1.0347466468811035, "logits/rejected": -1.0347466468811035, "logps/chosen": -51.701255798339844, "logps/rejected": -51.701255798339844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 2.0402932167053223, "rewards/margins": 0.0, "rewards/rejected": 2.0402932167053223, "step": 568 }, { "epoch": 0.13, "learning_rate": 9.971690143909066e-06, "logits/chosen": -0.9390620589256287, "logits/rejected": -1.074169635772705, "logps/chosen": -81.76659393310547, "logps/rejected": -71.26812744140625, "loss": 2.2014, "rewards/accuracies": 0.0, "rewards/chosen": 2.571981191635132, "rewards/margins": -3.4096410274505615, "rewards/rejected": 5.981622219085693, "step": 569 }, { "epoch": 0.13, "learning_rate": 9.971499365829049e-06, "logits/chosen": -1.4546196460723877, "logits/rejected": -1.4687345027923584, "logps/chosen": -116.41786193847656, "logps/rejected": -71.06974792480469, "loss": 0.2522, "rewards/accuracies": 1.0, "rewards/chosen": 7.904624938964844, "rewards/margins": 0.42840576171875, "rewards/rejected": 7.476219177246094, "step": 570 }, { "epoch": 0.13, "learning_rate": 9.971307948923316e-06, "logits/chosen": -1.1010762453079224, "logits/rejected": -1.1217498779296875, "logps/chosen": -114.17547607421875, "logps/rejected": -191.09625244140625, "loss": 0.7302, "rewards/accuracies": 0.0, "rewards/chosen": 8.78729248046875, "rewards/margins": -1.022674560546875, "rewards/rejected": 9.809967041015625, "step": 571 }, { "epoch": 0.13, "learning_rate": 9.971115893216463e-06, "logits/chosen": -0.8739522695541382, "logits/rejected": -0.7261659502983093, "logps/chosen": -45.41069793701172, "logps/rejected": -41.458946228027344, "loss": 0.3091, "rewards/accuracies": 1.0, "rewards/chosen": 3.0532472133636475, "rewards/margins": 0.26429080963134766, "rewards/rejected": 2.7889564037323, "step": 572 }, { "epoch": 0.13, "learning_rate": 9.970923198733167e-06, "logits/chosen": -0.9392648935317993, "logits/rejected": -0.9384757876396179, "logps/chosen": -29.300031661987305, "logps/rejected": -29.98308753967285, "loss": 0.5766, "rewards/accuracies": 0.0, "rewards/chosen": 0.9770715832710266, "rewards/margins": -0.6612086892127991, "rewards/rejected": 1.6382802724838257, "step": 573 }, { "epoch": 0.13, "learning_rate": 9.97072986549819e-06, "logits/chosen": -0.7269713878631592, "logits/rejected": -0.7001593708992004, "logps/chosen": -41.48028564453125, "logps/rejected": -59.15705871582031, "loss": 1.0846, "rewards/accuracies": 0.0, "rewards/chosen": 1.8153778314590454, "rewards/margins": -1.2166329622268677, "rewards/rejected": 3.032010793685913, "step": 574 }, { "epoch": 0.13, "learning_rate": 9.970535893536375e-06, "logits/chosen": -1.2730423212051392, "logits/rejected": -1.0364736318588257, "logps/chosen": -80.91554260253906, "logps/rejected": -13.259089469909668, "loss": 2.2727, "rewards/accuracies": 1.0, "rewards/chosen": 3.9751617908477783, "rewards/margins": 3.523966073989868, "rewards/rejected": 0.451195627450943, "step": 575 }, { "epoch": 0.13, "learning_rate": 9.970341282872645e-06, "logits/chosen": -0.7855873703956604, "logits/rejected": -0.8036335706710815, "logps/chosen": -30.516590118408203, "logps/rejected": -78.01873016357422, "loss": 2.0035, "rewards/accuracies": 0.0, "rewards/chosen": 1.5023167133331299, "rewards/margins": -3.387644052505493, "rewards/rejected": 4.889960765838623, "step": 576 }, { "epoch": 0.13, "learning_rate": 9.97014603353201e-06, "logits/chosen": -0.868736207485199, "logits/rejected": -0.8830854296684265, "logps/chosen": -60.729801177978516, "logps/rejected": -109.86590576171875, "loss": 2.1933, "rewards/accuracies": 0.0, "rewards/chosen": 2.433993101119995, "rewards/margins": -1.1553990840911865, "rewards/rejected": 3.5893921852111816, "step": 577 }, { "epoch": 0.13, "learning_rate": 9.969950145539557e-06, "logits/chosen": -0.8872392773628235, "logits/rejected": -0.9091609716415405, "logps/chosen": -72.50015258789062, "logps/rejected": -77.13002014160156, "loss": 1.7621, "rewards/accuracies": 0.0, "rewards/chosen": 1.6868194341659546, "rewards/margins": -3.3907790184020996, "rewards/rejected": 5.077598571777344, "step": 578 }, { "epoch": 0.13, "learning_rate": 9.969753618920456e-06, "logits/chosen": -1.6031752824783325, "logits/rejected": -1.5353262424468994, "logps/chosen": -128.67735290527344, "logps/rejected": -148.22113037109375, "loss": 0.6264, "rewards/accuracies": 0.0, "rewards/chosen": 4.6036696434021, "rewards/margins": -0.26044464111328125, "rewards/rejected": 4.864114284515381, "step": 579 }, { "epoch": 0.13, "learning_rate": 9.969556453699966e-06, "logits/chosen": -1.315438985824585, "logits/rejected": -1.2745296955108643, "logps/chosen": -47.57989501953125, "logps/rejected": -38.01961135864258, "loss": 0.8229, "rewards/accuracies": 1.0, "rewards/chosen": 2.4989585876464844, "rewards/margins": 0.15259289741516113, "rewards/rejected": 2.3463656902313232, "step": 580 }, { "epoch": 0.13, "learning_rate": 9.969358649903415e-06, "logits/chosen": -0.7855647206306458, "logits/rejected": -0.6801638603210449, "logps/chosen": -88.43793487548828, "logps/rejected": -58.52690124511719, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": 4.195678234100342, "rewards/margins": 2.251230239868164, "rewards/rejected": 1.9444481134414673, "step": 581 }, { "epoch": 0.13, "learning_rate": 9.969160207556225e-06, "logits/chosen": -1.0142983198165894, "logits/rejected": -1.1189149618148804, "logps/chosen": -60.331642150878906, "logps/rejected": -123.50259399414062, "loss": 1.1858, "rewards/accuracies": 0.0, "rewards/chosen": 2.956683397293091, "rewards/margins": -1.4718568325042725, "rewards/rejected": 4.428540229797363, "step": 582 }, { "epoch": 0.13, "learning_rate": 9.968961126683893e-06, "logits/chosen": -1.0831999778747559, "logits/rejected": -0.8526504039764404, "logps/chosen": -125.50682067871094, "logps/rejected": -37.2948112487793, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 6.102830410003662, "rewards/margins": 3.440478801727295, "rewards/rejected": 2.662351608276367, "step": 583 }, { "epoch": 0.13, "learning_rate": 9.968761407312002e-06, "logits/chosen": -1.4102096557617188, "logits/rejected": -1.2536228895187378, "logps/chosen": -106.25602722167969, "logps/rejected": -53.87908935546875, "loss": 0.8062, "rewards/accuracies": 1.0, "rewards/chosen": 6.342399597167969, "rewards/margins": 1.049466609954834, "rewards/rejected": 5.292932987213135, "step": 584 }, { "epoch": 0.13, "learning_rate": 9.968561049466214e-06, "logits/chosen": -1.4538536071777344, "logits/rejected": -1.4193453788757324, "logps/chosen": -66.69947814941406, "logps/rejected": -39.1663932800293, "loss": 1.7157, "rewards/accuracies": 0.0, "rewards/chosen": 2.2605645656585693, "rewards/margins": -1.5169627666473389, "rewards/rejected": 3.777527332305908, "step": 585 }, { "epoch": 0.13, "learning_rate": 9.968360053172275e-06, "logits/chosen": -1.1309152841567993, "logits/rejected": -1.0552607774734497, "logps/chosen": -65.29741668701172, "logps/rejected": -70.34663391113281, "loss": 0.4033, "rewards/accuracies": 0.0, "rewards/chosen": 2.520256757736206, "rewards/margins": -0.16564416885375977, "rewards/rejected": 2.685900926589966, "step": 586 }, { "epoch": 0.13, "learning_rate": 9.968158418456013e-06, "logits/chosen": -1.0768381357192993, "logits/rejected": -1.0768381357192993, "logps/chosen": -100.64556121826172, "logps/rejected": -100.64556121826172, "loss": 0.3729, "rewards/accuracies": 0.0, "rewards/chosen": 4.678775787353516, "rewards/margins": 0.0, "rewards/rejected": 4.678775787353516, "step": 587 }, { "epoch": 0.13, "learning_rate": 9.967956145343339e-06, "logits/chosen": -0.6667290925979614, "logits/rejected": -0.6039336323738098, "logps/chosen": -40.66505813598633, "logps/rejected": -18.183238983154297, "loss": 0.8707, "rewards/accuracies": 0.0, "rewards/chosen": 0.9686382412910461, "rewards/margins": -0.8742184042930603, "rewards/rejected": 1.8428566455841064, "step": 588 }, { "epoch": 0.13, "learning_rate": 9.96775323386024e-06, "logits/chosen": -0.8162684440612793, "logits/rejected": -0.7922945022583008, "logps/chosen": -39.91571807861328, "logps/rejected": -63.16123580932617, "loss": 0.6336, "rewards/accuracies": 0.0, "rewards/chosen": 1.9742447137832642, "rewards/margins": -0.9346195459365845, "rewards/rejected": 2.9088642597198486, "step": 589 }, { "epoch": 0.13, "learning_rate": 9.967549684032796e-06, "logits/chosen": -1.1510961055755615, "logits/rejected": -1.1129013299942017, "logps/chosen": -69.68771362304688, "logps/rejected": -73.6832275390625, "loss": 1.2959, "rewards/accuracies": 0.0, "rewards/chosen": 2.075946092605591, "rewards/margins": -2.0614569187164307, "rewards/rejected": 4.1374030113220215, "step": 590 }, { "epoch": 0.13, "learning_rate": 9.967345495887157e-06, "logits/chosen": -1.3111090660095215, "logits/rejected": -1.3018540143966675, "logps/chosen": -84.92273712158203, "logps/rejected": -135.05966186523438, "loss": 1.0242, "rewards/accuracies": 0.0, "rewards/chosen": 7.629136085510254, "rewards/margins": -1.7207145690917969, "rewards/rejected": 9.34985065460205, "step": 591 }, { "epoch": 0.13, "learning_rate": 9.967140669449562e-06, "logits/chosen": -1.6100136041641235, "logits/rejected": -1.5377322435379028, "logps/chosen": -55.975341796875, "logps/rejected": -11.968698501586914, "loss": 0.5065, "rewards/accuracies": 0.0, "rewards/chosen": 1.4465652704238892, "rewards/margins": -0.5246665477752686, "rewards/rejected": 1.9712318181991577, "step": 592 }, { "epoch": 0.13, "learning_rate": 9.966935204746332e-06, "logits/chosen": -0.9893565773963928, "logits/rejected": -0.9523180723190308, "logps/chosen": -88.74769592285156, "logps/rejected": -83.22789764404297, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 3.415508270263672, "rewards/margins": 0.5488967895507812, "rewards/rejected": 2.8666114807128906, "step": 593 }, { "epoch": 0.13, "learning_rate": 9.966729101803872e-06, "logits/chosen": -0.8116081953048706, "logits/rejected": -0.7165506482124329, "logps/chosen": -111.0355224609375, "logps/rejected": -49.43412780761719, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 2.429251194000244, "rewards/margins": 0.1722886562347412, "rewards/rejected": 2.256962537765503, "step": 594 }, { "epoch": 0.13, "learning_rate": 9.966522360648659e-06, "logits/chosen": -1.077925682067871, "logits/rejected": -1.0770552158355713, "logps/chosen": -43.16987991333008, "logps/rejected": -142.00584411621094, "loss": 1.7172, "rewards/accuracies": 0.0, "rewards/chosen": 4.544665336608887, "rewards/margins": -2.132133960723877, "rewards/rejected": 6.676799297332764, "step": 595 }, { "epoch": 0.13, "learning_rate": 9.966314981307261e-06, "logits/chosen": -1.3431459665298462, "logits/rejected": -1.0923829078674316, "logps/chosen": -164.4987335205078, "logps/rejected": -50.868072509765625, "loss": 0.1451, "rewards/accuracies": 1.0, "rewards/chosen": 5.890162944793701, "rewards/margins": 1.0969576835632324, "rewards/rejected": 4.793205261230469, "step": 596 }, { "epoch": 0.13, "learning_rate": 9.96610696380633e-06, "logits/chosen": -1.0483441352844238, "logits/rejected": -0.9189826846122742, "logps/chosen": -115.70851135253906, "logps/rejected": -67.07191467285156, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 2.6677443981170654, "rewards/margins": 0.53767991065979, "rewards/rejected": 2.1300644874572754, "step": 597 }, { "epoch": 0.13, "learning_rate": 9.965898308172589e-06, "logits/chosen": -0.9494304060935974, "logits/rejected": -0.8969076871871948, "logps/chosen": -45.80952072143555, "logps/rejected": -38.50404739379883, "loss": 0.4151, "rewards/accuracies": 0.0, "rewards/chosen": 1.9820507764816284, "rewards/margins": -0.17104065418243408, "rewards/rejected": 2.1530914306640625, "step": 598 }, { "epoch": 0.13, "learning_rate": 9.965689014432854e-06, "logits/chosen": -1.2514389753341675, "logits/rejected": -1.0738403797149658, "logps/chosen": -69.44527435302734, "logps/rejected": -42.06882095336914, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": 4.230501651763916, "rewards/margins": 3.673982620239258, "rewards/rejected": 0.5565189719200134, "step": 599 }, { "epoch": 0.13, "learning_rate": 9.965479082614019e-06, "logits/chosen": -1.0003645420074463, "logits/rejected": -0.967281699180603, "logps/chosen": -89.9124526977539, "logps/rejected": -74.95584106445312, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": 6.671113014221191, "rewards/margins": 3.8837716579437256, "rewards/rejected": 2.787341356277466, "step": 600 }, { "epoch": 0.13, "learning_rate": 9.965268512743058e-06, "logits/chosen": -1.2869150638580322, "logits/rejected": -1.1237589120864868, "logps/chosen": -70.35267639160156, "logps/rejected": -55.48200225830078, "loss": 0.6206, "rewards/accuracies": 0.0, "rewards/chosen": 2.0407912731170654, "rewards/margins": -0.11774992942810059, "rewards/rejected": 2.158541202545166, "step": 601 }, { "epoch": 0.13, "learning_rate": 9.965057304847029e-06, "logits/chosen": -1.2169512510299683, "logits/rejected": -1.1379315853118896, "logps/chosen": -52.77320098876953, "logps/rejected": -43.385746002197266, "loss": 0.3615, "rewards/accuracies": 1.0, "rewards/chosen": 2.4648940563201904, "rewards/margins": 0.367781400680542, "rewards/rejected": 2.0971126556396484, "step": 602 }, { "epoch": 0.13, "learning_rate": 9.964845458953072e-06, "logits/chosen": -1.1827765703201294, "logits/rejected": -1.0335381031036377, "logps/chosen": -7.797544479370117, "logps/rejected": -170.77792358398438, "loss": 2.6569, "rewards/accuracies": 0.0, "rewards/chosen": 0.11559142917394638, "rewards/margins": -5.276998996734619, "rewards/rejected": 5.392590522766113, "step": 603 }, { "epoch": 0.13, "learning_rate": 9.964632975088408e-06, "logits/chosen": -1.04341459274292, "logits/rejected": -1.025041103363037, "logps/chosen": -36.905364990234375, "logps/rejected": -43.84458541870117, "loss": 0.387, "rewards/accuracies": 0.0, "rewards/chosen": 3.393268585205078, "rewards/margins": -0.14632916450500488, "rewards/rejected": 3.539597749710083, "step": 604 }, { "epoch": 0.13, "learning_rate": 9.964419853280343e-06, "logits/chosen": -1.1740245819091797, "logits/rejected": -1.3081660270690918, "logps/chosen": -113.88762664794922, "logps/rejected": -126.87309265136719, "loss": 1.8212, "rewards/accuracies": 0.0, "rewards/chosen": 5.400055885314941, "rewards/margins": -2.5574593544006348, "rewards/rejected": 7.957515239715576, "step": 605 }, { "epoch": 0.13, "learning_rate": 9.96420609355626e-06, "logits/chosen": -0.9848390221595764, "logits/rejected": -0.7589386701583862, "logps/chosen": -69.90298461914062, "logps/rejected": -16.042322158813477, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": 2.317978620529175, "rewards/margins": 2.139261245727539, "rewards/rejected": 0.17871741950511932, "step": 606 }, { "epoch": 0.13, "learning_rate": 9.963991695943627e-06, "logits/chosen": -1.3858892917633057, "logits/rejected": -1.0549912452697754, "logps/chosen": -123.46060180664062, "logps/rejected": -72.35623168945312, "loss": 0.354, "rewards/accuracies": 1.0, "rewards/chosen": 5.342309474945068, "rewards/margins": 6.07900857925415, "rewards/rejected": -0.7366989254951477, "step": 607 }, { "epoch": 0.13, "learning_rate": 9.963776660469996e-06, "logits/chosen": -1.0264986753463745, "logits/rejected": -1.1322510242462158, "logps/chosen": -60.038551330566406, "logps/rejected": -110.19911193847656, "loss": 1.4226, "rewards/accuracies": 0.0, "rewards/chosen": 1.6625274419784546, "rewards/margins": -1.896469235420227, "rewards/rejected": 3.5589966773986816, "step": 608 }, { "epoch": 0.13, "learning_rate": 9.963560987162994e-06, "logits/chosen": -1.3862894773483276, "logits/rejected": -1.0999091863632202, "logps/chosen": -118.34109497070312, "logps/rejected": -33.832794189453125, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 5.208227634429932, "rewards/margins": 4.591418743133545, "rewards/rejected": 0.6168087124824524, "step": 609 }, { "epoch": 0.14, "learning_rate": 9.96334467605034e-06, "logits/chosen": -1.195613145828247, "logits/rejected": -1.2266697883605957, "logps/chosen": -68.9725570678711, "logps/rejected": -84.01042175292969, "loss": 0.3249, "rewards/accuracies": 1.0, "rewards/chosen": 2.6336936950683594, "rewards/margins": 0.0924994945526123, "rewards/rejected": 2.541194200515747, "step": 610 }, { "epoch": 0.14, "learning_rate": 9.963127727159825e-06, "logits/chosen": -1.1804851293563843, "logits/rejected": -1.201613426208496, "logps/chosen": -23.794647216796875, "logps/rejected": -64.16606903076172, "loss": 1.3013, "rewards/accuracies": 0.0, "rewards/chosen": 1.9527568817138672, "rewards/margins": -2.446009635925293, "rewards/rejected": 4.39876651763916, "step": 611 }, { "epoch": 0.14, "learning_rate": 9.962910140519328e-06, "logits/chosen": -1.3140993118286133, "logits/rejected": -1.3967530727386475, "logps/chosen": -58.222496032714844, "logps/rejected": -143.22056579589844, "loss": 1.068, "rewards/accuracies": 0.0, "rewards/chosen": 5.96111536026001, "rewards/margins": -1.9492087364196777, "rewards/rejected": 7.9103240966796875, "step": 612 }, { "epoch": 0.14, "learning_rate": 9.96269191615681e-06, "logits/chosen": -1.3843004703521729, "logits/rejected": -1.3843004703521729, "logps/chosen": -73.48023986816406, "logps/rejected": -73.48023986816406, "loss": 0.3518, "rewards/accuracies": 0.0, "rewards/chosen": 3.473813772201538, "rewards/margins": 0.0, "rewards/rejected": 3.473813772201538, "step": 613 }, { "epoch": 0.14, "learning_rate": 9.96247305410031e-06, "logits/chosen": -1.3744605779647827, "logits/rejected": -1.3969011306762695, "logps/chosen": -67.77482604980469, "logps/rejected": -51.48440170288086, "loss": 0.323, "rewards/accuracies": 1.0, "rewards/chosen": 3.5765304565429688, "rewards/margins": 0.10608935356140137, "rewards/rejected": 3.4704411029815674, "step": 614 }, { "epoch": 0.14, "learning_rate": 9.962253554377952e-06, "logits/chosen": -1.0699326992034912, "logits/rejected": -1.0692625045776367, "logps/chosen": -71.3394546508789, "logps/rejected": -38.240943908691406, "loss": 0.3818, "rewards/accuracies": 0.0, "rewards/chosen": 2.4142022132873535, "rewards/margins": -0.04303932189941406, "rewards/rejected": 2.4572415351867676, "step": 615 }, { "epoch": 0.14, "learning_rate": 9.96203341701794e-06, "logits/chosen": -0.8815270662307739, "logits/rejected": -0.8588244318962097, "logps/chosen": -27.494043350219727, "logps/rejected": -31.888160705566406, "loss": 2.8435, "rewards/accuracies": 0.0, "rewards/chosen": 0.3172933757305145, "rewards/margins": -0.1871614158153534, "rewards/rejected": 0.5044547915458679, "step": 616 }, { "epoch": 0.14, "learning_rate": 9.961812642048563e-06, "logits/chosen": -1.2894856929779053, "logits/rejected": -1.2151165008544922, "logps/chosen": -102.9820785522461, "logps/rejected": -63.195457458496094, "loss": 1.1027, "rewards/accuracies": 1.0, "rewards/chosen": 4.713947296142578, "rewards/margins": 1.7773070335388184, "rewards/rejected": 2.9366402626037598, "step": 617 }, { "epoch": 0.14, "learning_rate": 9.961591229498192e-06, "logits/chosen": -0.9311395883560181, "logits/rejected": -1.0600757598876953, "logps/chosen": -52.50666046142578, "logps/rejected": -59.05673599243164, "loss": 1.194, "rewards/accuracies": 0.0, "rewards/chosen": 1.6000900268554688, "rewards/margins": -1.7178096771240234, "rewards/rejected": 3.317899703979492, "step": 618 }, { "epoch": 0.14, "learning_rate": 9.96136917939527e-06, "logits/chosen": -1.0596128702163696, "logits/rejected": -0.8759519457817078, "logps/chosen": -70.37085723876953, "logps/rejected": -18.20486068725586, "loss": 0.6006, "rewards/accuracies": 1.0, "rewards/chosen": 0.8514259457588196, "rewards/margins": 0.734932541847229, "rewards/rejected": 0.11649341881275177, "step": 619 }, { "epoch": 0.14, "learning_rate": 9.961146491768338e-06, "logits/chosen": -1.1195464134216309, "logits/rejected": -0.9916089773178101, "logps/chosen": -110.82062530517578, "logps/rejected": -72.791259765625, "loss": 1.2155, "rewards/accuracies": 1.0, "rewards/chosen": 4.1927971839904785, "rewards/margins": 1.0074303150177002, "rewards/rejected": 3.1853668689727783, "step": 620 }, { "epoch": 0.14, "learning_rate": 9.96092316664601e-06, "logits/chosen": -1.5569725036621094, "logits/rejected": -1.4268138408660889, "logps/chosen": -140.26385498046875, "logps/rejected": -59.500091552734375, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 7.246217250823975, "rewards/margins": 4.96832275390625, "rewards/rejected": 2.2778947353363037, "step": 621 }, { "epoch": 0.14, "learning_rate": 9.960699204056978e-06, "logits/chosen": -1.039927363395691, "logits/rejected": -0.8920609354972839, "logps/chosen": -72.03984832763672, "logps/rejected": -16.042842864990234, "loss": 0.9941, "rewards/accuracies": 1.0, "rewards/chosen": 3.50761342048645, "rewards/margins": 2.8642821311950684, "rewards/rejected": 0.6433311700820923, "step": 622 }, { "epoch": 0.14, "learning_rate": 9.960474604030026e-06, "logits/chosen": -1.0723059177398682, "logits/rejected": -0.9941428899765015, "logps/chosen": -91.63407897949219, "logps/rejected": -47.88245391845703, "loss": 0.5062, "rewards/accuracies": 0.0, "rewards/chosen": 2.899395704269409, "rewards/margins": -0.4457817077636719, "rewards/rejected": 3.345177412033081, "step": 623 }, { "epoch": 0.14, "learning_rate": 9.96024936659401e-06, "logits/chosen": -1.1492501497268677, "logits/rejected": -1.028859257698059, "logps/chosen": -66.74345397949219, "logps/rejected": -21.24627685546875, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 2.4544341564178467, "rewards/margins": 1.532380223274231, "rewards/rejected": 0.9220539331436157, "step": 624 }, { "epoch": 0.14, "learning_rate": 9.960023491777875e-06, "logits/chosen": -1.1040512323379517, "logits/rejected": -1.1040512323379517, "logps/chosen": -86.72298431396484, "logps/rejected": -86.72298431396484, "loss": 1.1615, "rewards/accuracies": 0.0, "rewards/chosen": 3.723349094390869, "rewards/margins": 0.0, "rewards/rejected": 3.723349094390869, "step": 625 }, { "epoch": 0.14, "learning_rate": 9.959796979610646e-06, "logits/chosen": -1.3080888986587524, "logits/rejected": -1.2226186990737915, "logps/chosen": -137.0139617919922, "logps/rejected": -86.95228576660156, "loss": 0.235, "rewards/accuracies": 1.0, "rewards/chosen": 4.632447719573975, "rewards/margins": 0.672600507736206, "rewards/rejected": 3.9598472118377686, "step": 626 }, { "epoch": 0.14, "learning_rate": 9.959569830121427e-06, "logits/chosen": -1.343255877494812, "logits/rejected": -1.23331880569458, "logps/chosen": -41.36071014404297, "logps/rejected": -39.371341705322266, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 2.5600929260253906, "rewards/margins": 0.557823896408081, "rewards/rejected": 2.0022690296173096, "step": 627 }, { "epoch": 0.14, "learning_rate": 9.959342043339406e-06, "logits/chosen": -1.1535333395004272, "logits/rejected": -0.9238047003746033, "logps/chosen": -61.79963302612305, "logps/rejected": -20.027921676635742, "loss": 0.6284, "rewards/accuracies": 1.0, "rewards/chosen": 1.7234219312667847, "rewards/margins": 1.214048147201538, "rewards/rejected": 0.5093738436698914, "step": 628 }, { "epoch": 0.14, "learning_rate": 9.959113619293857e-06, "logits/chosen": -0.7275404334068298, "logits/rejected": -0.6754096150398254, "logps/chosen": -56.232322692871094, "logps/rejected": -32.89015197753906, "loss": 0.8049, "rewards/accuracies": 0.0, "rewards/chosen": 0.9827491641044617, "rewards/margins": -1.3461406230926514, "rewards/rejected": 2.328889846801758, "step": 629 }, { "epoch": 0.14, "learning_rate": 9.958884558014128e-06, "logits/chosen": -1.0530403852462769, "logits/rejected": -1.2087976932525635, "logps/chosen": -67.54501342773438, "logps/rejected": -123.45967864990234, "loss": 2.8077, "rewards/accuracies": 0.0, "rewards/chosen": 3.7575714588165283, "rewards/margins": -5.606461524963379, "rewards/rejected": 9.364032745361328, "step": 630 }, { "epoch": 0.14, "learning_rate": 9.958654859529654e-06, "logits/chosen": -0.9977337121963501, "logits/rejected": -0.9347038269042969, "logps/chosen": -90.70924377441406, "logps/rejected": -98.8297348022461, "loss": 1.2939, "rewards/accuracies": 1.0, "rewards/chosen": 5.957086086273193, "rewards/margins": 0.7223715782165527, "rewards/rejected": 5.234714508056641, "step": 631 }, { "epoch": 0.14, "learning_rate": 9.958424523869952e-06, "logits/chosen": -1.3130013942718506, "logits/rejected": -0.9395425319671631, "logps/chosen": -136.2090606689453, "logps/rejected": -39.95310974121094, "loss": 0.6084, "rewards/accuracies": 1.0, "rewards/chosen": 7.6246337890625, "rewards/margins": 6.014062881469727, "rewards/rejected": 1.6105709075927734, "step": 632 }, { "epoch": 0.14, "learning_rate": 9.958193551064617e-06, "logits/chosen": -1.0225762128829956, "logits/rejected": -1.0128161907196045, "logps/chosen": -68.57991027832031, "logps/rejected": -86.66812133789062, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 1.8025788068771362, "rewards/margins": 0.03683328628540039, "rewards/rejected": 1.7657455205917358, "step": 633 }, { "epoch": 0.14, "learning_rate": 9.95796194114333e-06, "logits/chosen": -1.127359390258789, "logits/rejected": -1.0581351518630981, "logps/chosen": -113.71224212646484, "logps/rejected": -49.907135009765625, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 6.759192943572998, "rewards/margins": 3.9820353984832764, "rewards/rejected": 2.7771575450897217, "step": 634 }, { "epoch": 0.14, "learning_rate": 9.957729694135852e-06, "logits/chosen": -1.2364306449890137, "logits/rejected": -1.2372355461120605, "logps/chosen": -77.2925033569336, "logps/rejected": -55.14326477050781, "loss": 1.6655, "rewards/accuracies": 0.0, "rewards/chosen": 1.582079291343689, "rewards/margins": -0.3622368574142456, "rewards/rejected": 1.9443161487579346, "step": 635 }, { "epoch": 0.14, "learning_rate": 9.957496810072027e-06, "logits/chosen": -1.486163854598999, "logits/rejected": -1.344456672668457, "logps/chosen": -71.76727294921875, "logps/rejected": -28.208553314208984, "loss": 0.1639, "rewards/accuracies": 1.0, "rewards/chosen": 1.531427025794983, "rewards/margins": 0.9880035519599915, "rewards/rejected": 0.5434234738349915, "step": 636 }, { "epoch": 0.14, "learning_rate": 9.957263288981779e-06, "logits/chosen": -0.9820107221603394, "logits/rejected": -1.054787039756775, "logps/chosen": -16.73314666748047, "logps/rejected": -27.67054557800293, "loss": 0.9211, "rewards/accuracies": 0.0, "rewards/chosen": 1.5212697982788086, "rewards/margins": -0.7677772045135498, "rewards/rejected": 2.2890470027923584, "step": 637 }, { "epoch": 0.14, "learning_rate": 9.957029130895116e-06, "logits/chosen": -0.8246098756790161, "logits/rejected": -0.9779818058013916, "logps/chosen": -47.36240768432617, "logps/rejected": -109.39952087402344, "loss": 1.907, "rewards/accuracies": 0.0, "rewards/chosen": 2.28056001663208, "rewards/margins": -3.307460308074951, "rewards/rejected": 5.588020324707031, "step": 638 }, { "epoch": 0.14, "learning_rate": 9.956794335842126e-06, "logits/chosen": -0.9667169451713562, "logits/rejected": -0.9146294593811035, "logps/chosen": -96.2650146484375, "logps/rejected": -72.90772247314453, "loss": 1.0419, "rewards/accuracies": 0.0, "rewards/chosen": 3.0687997341156006, "rewards/margins": -1.8532278537750244, "rewards/rejected": 4.922027587890625, "step": 639 }, { "epoch": 0.14, "learning_rate": 9.956558903852978e-06, "logits/chosen": -1.0051970481872559, "logits/rejected": -0.9733060598373413, "logps/chosen": -50.0858154296875, "logps/rejected": -74.94982147216797, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 3.4395523071289062, "rewards/margins": 1.0006613731384277, "rewards/rejected": 2.4388909339904785, "step": 640 }, { "epoch": 0.14, "learning_rate": 9.956322834957929e-06, "logits/chosen": -1.1120227575302124, "logits/rejected": -1.2463650703430176, "logps/chosen": -128.0408477783203, "logps/rejected": -103.71902465820312, "loss": 2.7667, "rewards/accuracies": 0.0, "rewards/chosen": 3.0379090309143066, "rewards/margins": -5.486410617828369, "rewards/rejected": 8.524319648742676, "step": 641 }, { "epoch": 0.14, "learning_rate": 9.956086129187308e-06, "logits/chosen": -0.7538617253303528, "logits/rejected": -0.7538617253303528, "logps/chosen": -26.910995483398438, "logps/rejected": -26.910995483398438, "loss": 1.7916, "rewards/accuracies": 0.0, "rewards/chosen": 2.1984291076660156, "rewards/margins": 0.0, "rewards/rejected": 2.1984291076660156, "step": 642 }, { "epoch": 0.14, "learning_rate": 9.955848786571534e-06, "logits/chosen": -1.1369402408599854, "logits/rejected": -1.1201465129852295, "logps/chosen": -76.9760513305664, "logps/rejected": -92.87105560302734, "loss": 0.2064, "rewards/accuracies": 1.0, "rewards/chosen": 2.9001731872558594, "rewards/margins": 0.7271468639373779, "rewards/rejected": 2.1730263233184814, "step": 643 }, { "epoch": 0.14, "learning_rate": 9.955610807141105e-06, "logits/chosen": -1.0924406051635742, "logits/rejected": -1.083847999572754, "logps/chosen": -83.85772705078125, "logps/rejected": -95.58110046386719, "loss": 1.959, "rewards/accuracies": 0.0, "rewards/chosen": 2.683786153793335, "rewards/margins": -1.5184080600738525, "rewards/rejected": 4.2021942138671875, "step": 644 }, { "epoch": 0.14, "learning_rate": 9.9553721909266e-06, "logits/chosen": -1.301796793937683, "logits/rejected": -1.4650465250015259, "logps/chosen": -73.71622467041016, "logps/rejected": -112.56731414794922, "loss": 2.8511, "rewards/accuracies": 0.0, "rewards/chosen": 1.7276848554611206, "rewards/margins": -4.634893894195557, "rewards/rejected": 6.362578868865967, "step": 645 }, { "epoch": 0.14, "learning_rate": 9.95513293795868e-06, "logits/chosen": -1.0419424772262573, "logits/rejected": -1.0419424772262573, "logps/chosen": -30.50048828125, "logps/rejected": -30.50048828125, "loss": 0.3776, "rewards/accuracies": 0.0, "rewards/chosen": 2.095576047897339, "rewards/margins": 0.0, "rewards/rejected": 2.095576047897339, "step": 646 }, { "epoch": 0.14, "learning_rate": 9.95489304826809e-06, "logits/chosen": -1.3972105979919434, "logits/rejected": -1.2598156929016113, "logps/chosen": -82.51436614990234, "logps/rejected": -46.71862030029297, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": 5.764145851135254, "rewards/margins": 2.2544236183166504, "rewards/rejected": 3.5097222328186035, "step": 647 }, { "epoch": 0.14, "learning_rate": 9.954652521885656e-06, "logits/chosen": -1.165305733680725, "logits/rejected": -1.3743842840194702, "logps/chosen": -51.380271911621094, "logps/rejected": -137.32005310058594, "loss": 3.2342, "rewards/accuracies": 0.0, "rewards/chosen": 2.0387063026428223, "rewards/margins": -4.149313926696777, "rewards/rejected": 6.1880202293396, "step": 648 }, { "epoch": 0.14, "learning_rate": 9.954411358842282e-06, "logits/chosen": -1.3156031370162964, "logits/rejected": -1.2244879007339478, "logps/chosen": -234.80490112304688, "logps/rejected": -86.3525390625, "loss": 0.2656, "rewards/accuracies": 1.0, "rewards/chosen": 6.004703044891357, "rewards/margins": 0.6889543533325195, "rewards/rejected": 5.315748691558838, "step": 649 }, { "epoch": 0.14, "learning_rate": 9.954169559168958e-06, "logits/chosen": -1.6112252473831177, "logits/rejected": -1.401909351348877, "logps/chosen": -91.53729248046875, "logps/rejected": -23.848901748657227, "loss": 0.5579, "rewards/accuracies": 1.0, "rewards/chosen": 5.267062664031982, "rewards/margins": 4.609935760498047, "rewards/rejected": 0.6571270227432251, "step": 650 }, { "epoch": 0.14, "learning_rate": 9.953927122896756e-06, "logits/chosen": -1.186590313911438, "logits/rejected": -1.051351547241211, "logps/chosen": -114.36822509765625, "logps/rejected": -49.41801071166992, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 4.974920749664307, "rewards/margins": 3.454538345336914, "rewards/rejected": 1.520382285118103, "step": 651 }, { "epoch": 0.14, "learning_rate": 9.953684050056827e-06, "logits/chosen": -1.108567476272583, "logits/rejected": -0.9492206573486328, "logps/chosen": -54.989295959472656, "logps/rejected": -26.94692611694336, "loss": 1.0701, "rewards/accuracies": 1.0, "rewards/chosen": 2.572716474533081, "rewards/margins": 2.2363665103912354, "rewards/rejected": 0.33635005354881287, "step": 652 }, { "epoch": 0.14, "learning_rate": 9.953440340680407e-06, "logits/chosen": -1.3157341480255127, "logits/rejected": -1.1562632322311401, "logps/chosen": -158.19139099121094, "logps/rejected": -78.23274993896484, "loss": 0.0788, "rewards/accuracies": 1.0, "rewards/chosen": 6.969047546386719, "rewards/margins": 3.5710508823394775, "rewards/rejected": 3.397996664047241, "step": 653 }, { "epoch": 0.14, "learning_rate": 9.95319599479881e-06, "logits/chosen": -0.681500256061554, "logits/rejected": -0.4761806130409241, "logps/chosen": -51.99982452392578, "logps/rejected": -3.040365219116211, "loss": 0.4565, "rewards/accuracies": 1.0, "rewards/chosen": 2.290961503982544, "rewards/margins": 1.402588963508606, "rewards/rejected": 0.888372540473938, "step": 654 }, { "epoch": 0.14, "learning_rate": 9.952951012443434e-06, "logits/chosen": -0.8295598030090332, "logits/rejected": -0.6071867942810059, "logps/chosen": -78.81321716308594, "logps/rejected": -44.348812103271484, "loss": 0.8982, "rewards/accuracies": 1.0, "rewards/chosen": 2.979499101638794, "rewards/margins": 2.5944461822509766, "rewards/rejected": 0.385052889585495, "step": 655 }, { "epoch": 0.15, "learning_rate": 9.952705393645761e-06, "logits/chosen": -1.250331163406372, "logits/rejected": -1.1262093782424927, "logps/chosen": -93.99662017822266, "logps/rejected": -87.62357330322266, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 2.731464385986328, "rewards/margins": 0.6050474643707275, "rewards/rejected": 2.1264169216156006, "step": 656 }, { "epoch": 0.15, "learning_rate": 9.952459138437352e-06, "logits/chosen": -1.1205666065216064, "logits/rejected": -1.1256086826324463, "logps/chosen": -91.59378051757812, "logps/rejected": -100.60260009765625, "loss": 3.1564, "rewards/accuracies": 0.0, "rewards/chosen": 4.265194892883301, "rewards/margins": -4.841578483581543, "rewards/rejected": 9.106773376464844, "step": 657 }, { "epoch": 0.15, "learning_rate": 9.952212246849847e-06, "logits/chosen": -1.0677114725112915, "logits/rejected": -1.0740944147109985, "logps/chosen": -56.20384979248047, "logps/rejected": -58.11495590209961, "loss": 3.4439, "rewards/accuracies": 0.0, "rewards/chosen": 0.989374577999115, "rewards/margins": -2.69323992729187, "rewards/rejected": 3.68261456489563, "step": 658 }, { "epoch": 0.15, "learning_rate": 9.951964718914972e-06, "logits/chosen": -1.4336942434310913, "logits/rejected": -1.3675503730773926, "logps/chosen": -143.44082641601562, "logps/rejected": -141.51016235351562, "loss": 0.6063, "rewards/accuracies": 0.0, "rewards/chosen": 5.003668308258057, "rewards/margins": -0.8542327880859375, "rewards/rejected": 5.857901096343994, "step": 659 }, { "epoch": 0.15, "learning_rate": 9.951716554664537e-06, "logits/chosen": -0.9348170161247253, "logits/rejected": -0.8669443130493164, "logps/chosen": -53.433807373046875, "logps/rejected": -63.565879821777344, "loss": 0.3139, "rewards/accuracies": 1.0, "rewards/chosen": 2.515514373779297, "rewards/margins": 0.20892333984375, "rewards/rejected": 2.306591033935547, "step": 660 }, { "epoch": 0.15, "learning_rate": 9.951467754130429e-06, "logits/chosen": -0.9035085439682007, "logits/rejected": -0.8395532369613647, "logps/chosen": -26.886287689208984, "logps/rejected": -42.633121490478516, "loss": 0.3426, "rewards/accuracies": 1.0, "rewards/chosen": 1.5362865924835205, "rewards/margins": 0.018360137939453125, "rewards/rejected": 1.5179264545440674, "step": 661 }, { "epoch": 0.15, "learning_rate": 9.951218317344615e-06, "logits/chosen": -0.9612313508987427, "logits/rejected": -0.9612313508987427, "logps/chosen": -47.52599334716797, "logps/rejected": -47.52599334716797, "loss": 1.2135, "rewards/accuracies": 0.0, "rewards/chosen": 2.9898643493652344, "rewards/margins": 0.0, "rewards/rejected": 2.9898643493652344, "step": 662 }, { "epoch": 0.15, "learning_rate": 9.950968244339152e-06, "logits/chosen": -0.8430954813957214, "logits/rejected": -0.7374628186225891, "logps/chosen": -51.12308120727539, "logps/rejected": -39.02394485473633, "loss": 0.175, "rewards/accuracies": 1.0, "rewards/chosen": 3.4020848274230957, "rewards/margins": 0.9527699947357178, "rewards/rejected": 2.449314832687378, "step": 663 }, { "epoch": 0.15, "learning_rate": 9.95071753514617e-06, "logits/chosen": -0.8036304712295532, "logits/rejected": -0.9419063925743103, "logps/chosen": -51.302947998046875, "logps/rejected": -115.46924591064453, "loss": 1.9889, "rewards/accuracies": 0.0, "rewards/chosen": 2.5874946117401123, "rewards/margins": -2.792550802230835, "rewards/rejected": 5.380045413970947, "step": 664 }, { "epoch": 0.15, "learning_rate": 9.950466189797885e-06, "logits/chosen": -1.660449743270874, "logits/rejected": -1.6039977073669434, "logps/chosen": -40.04412078857422, "logps/rejected": -35.530181884765625, "loss": 0.1425, "rewards/accuracies": 1.0, "rewards/chosen": 1.8976963758468628, "rewards/margins": 1.30454421043396, "rewards/rejected": 0.5931522250175476, "step": 665 }, { "epoch": 0.15, "learning_rate": 9.950214208326598e-06, "logits/chosen": -1.02236008644104, "logits/rejected": -1.0193345546722412, "logps/chosen": -36.93968963623047, "logps/rejected": -37.34523010253906, "loss": 1.1595, "rewards/accuracies": 0.0, "rewards/chosen": 1.2867828607559204, "rewards/margins": -1.9894622564315796, "rewards/rejected": 3.2762451171875, "step": 666 }, { "epoch": 0.15, "learning_rate": 9.949961590764682e-06, "logits/chosen": -1.0696598291397095, "logits/rejected": -1.0696598291397095, "logps/chosen": -59.98164367675781, "logps/rejected": -59.98164367675781, "loss": 0.7484, "rewards/accuracies": 0.0, "rewards/chosen": 3.812901258468628, "rewards/margins": 0.0, "rewards/rejected": 3.812901258468628, "step": 667 }, { "epoch": 0.15, "learning_rate": 9.949708337144603e-06, "logits/chosen": -0.9616526961326599, "logits/rejected": -0.9090555310249329, "logps/chosen": -153.04202270507812, "logps/rejected": -50.76097106933594, "loss": 0.194, "rewards/accuracies": 1.0, "rewards/chosen": 5.715466499328613, "rewards/margins": 1.0416903495788574, "rewards/rejected": 4.673776149749756, "step": 668 }, { "epoch": 0.15, "learning_rate": 9.949454447498901e-06, "logits/chosen": -1.0309410095214844, "logits/rejected": -1.0309410095214844, "logps/chosen": -87.77435302734375, "logps/rejected": -87.77435302734375, "loss": 0.4855, "rewards/accuracies": 0.0, "rewards/chosen": 3.6953513622283936, "rewards/margins": 0.0, "rewards/rejected": 3.6953513622283936, "step": 669 }, { "epoch": 0.15, "learning_rate": 9.949199921860202e-06, "logits/chosen": -1.1041321754455566, "logits/rejected": -1.0860235691070557, "logps/chosen": -16.13333511352539, "logps/rejected": -34.54832458496094, "loss": 1.0502, "rewards/accuracies": 0.0, "rewards/chosen": 1.8115253448486328, "rewards/margins": -1.859053373336792, "rewards/rejected": 3.670578718185425, "step": 670 }, { "epoch": 0.15, "learning_rate": 9.94894476026121e-06, "logits/chosen": -1.097142219543457, "logits/rejected": -1.097142219543457, "logps/chosen": -54.85260772705078, "logps/rejected": -54.85260772705078, "loss": 0.4594, "rewards/accuracies": 0.0, "rewards/chosen": 3.010838270187378, "rewards/margins": 0.0, "rewards/rejected": 3.010838270187378, "step": 671 }, { "epoch": 0.15, "learning_rate": 9.948688962734711e-06, "logits/chosen": -0.8489903211593628, "logits/rejected": -0.7649847865104675, "logps/chosen": -61.941490173339844, "logps/rejected": -21.05259132385254, "loss": 0.6066, "rewards/accuracies": 0.0, "rewards/chosen": 1.4073807001113892, "rewards/margins": -0.17472052574157715, "rewards/rejected": 1.5821012258529663, "step": 672 }, { "epoch": 0.15, "learning_rate": 9.94843252931358e-06, "logits/chosen": -1.1772154569625854, "logits/rejected": -1.0555135011672974, "logps/chosen": -116.14846801757812, "logps/rejected": -62.81169128417969, "loss": 0.8438, "rewards/accuracies": 0.0, "rewards/chosen": 4.290945529937744, "rewards/margins": -0.8268251419067383, "rewards/rejected": 5.117770671844482, "step": 673 }, { "epoch": 0.15, "learning_rate": 9.948175460030762e-06, "logits/chosen": -1.0862250328063965, "logits/rejected": -0.950695812702179, "logps/chosen": -86.40037536621094, "logps/rejected": -75.5044937133789, "loss": 0.9347, "rewards/accuracies": 0.0, "rewards/chosen": 3.5061867237091064, "rewards/margins": -1.603916883468628, "rewards/rejected": 5.110103607177734, "step": 674 }, { "epoch": 0.15, "learning_rate": 9.947917754919293e-06, "logits/chosen": -0.979145348072052, "logits/rejected": -1.0519437789916992, "logps/chosen": -42.29158020019531, "logps/rejected": -67.49465942382812, "loss": 1.7565, "rewards/accuracies": 0.0, "rewards/chosen": 2.736288547515869, "rewards/margins": -0.5756874084472656, "rewards/rejected": 3.3119759559631348, "step": 675 }, { "epoch": 0.15, "learning_rate": 9.947659414012287e-06, "logits/chosen": -1.1467326879501343, "logits/rejected": -1.0438357591629028, "logps/chosen": -33.179386138916016, "logps/rejected": -4.629325866699219, "loss": 0.5473, "rewards/accuracies": 1.0, "rewards/chosen": 3.2554852962493896, "rewards/margins": 2.1247782707214355, "rewards/rejected": 1.1307071447372437, "step": 676 }, { "epoch": 0.15, "learning_rate": 9.94740043734294e-06, "logits/chosen": -1.025199294090271, "logits/rejected": -0.9896203875541687, "logps/chosen": -99.74143981933594, "logps/rejected": -64.43359375, "loss": 1.4269, "rewards/accuracies": 0.0, "rewards/chosen": 1.7423492670059204, "rewards/margins": -1.5340896844863892, "rewards/rejected": 3.2764389514923096, "step": 677 }, { "epoch": 0.15, "learning_rate": 9.947140824944533e-06, "logits/chosen": -0.8352534770965576, "logits/rejected": -0.7782428860664368, "logps/chosen": -69.43968963623047, "logps/rejected": -44.154327392578125, "loss": 0.1669, "rewards/accuracies": 1.0, "rewards/chosen": 3.1261932849884033, "rewards/margins": 0.9807829856872559, "rewards/rejected": 2.1454102993011475, "step": 678 }, { "epoch": 0.15, "learning_rate": 9.946880576850418e-06, "logits/chosen": -1.0744295120239258, "logits/rejected": -1.0746487379074097, "logps/chosen": -30.28821563720703, "logps/rejected": -66.56968688964844, "loss": 0.4937, "rewards/accuracies": 0.0, "rewards/chosen": 2.0300862789154053, "rewards/margins": -0.3529505729675293, "rewards/rejected": 2.3830368518829346, "step": 679 }, { "epoch": 0.15, "learning_rate": 9.946619693094044e-06, "logits/chosen": -1.0809814929962158, "logits/rejected": -1.1000816822052002, "logps/chosen": -63.87028121948242, "logps/rejected": -49.293365478515625, "loss": 0.9605, "rewards/accuracies": 0.0, "rewards/chosen": 1.6432751417160034, "rewards/margins": -0.996188759803772, "rewards/rejected": 2.6394639015197754, "step": 680 }, { "epoch": 0.15, "learning_rate": 9.94635817370893e-06, "logits/chosen": -1.1676348447799683, "logits/rejected": -1.1676348447799683, "logps/chosen": -194.3327178955078, "logps/rejected": -194.3327178955078, "loss": 0.3491, "rewards/accuracies": 0.0, "rewards/chosen": 7.585044860839844, "rewards/margins": 0.0, "rewards/rejected": 7.585044860839844, "step": 681 }, { "epoch": 0.15, "learning_rate": 9.94609601872868e-06, "logits/chosen": -0.9300614595413208, "logits/rejected": -0.9300614595413208, "logps/chosen": -70.94570922851562, "logps/rejected": -70.94570922851562, "loss": 0.588, "rewards/accuracies": 0.0, "rewards/chosen": 3.318424940109253, "rewards/margins": 0.0, "rewards/rejected": 3.318424940109253, "step": 682 }, { "epoch": 0.15, "learning_rate": 9.945833228186984e-06, "logits/chosen": -1.3008413314819336, "logits/rejected": -1.2154029607772827, "logps/chosen": -71.49203491210938, "logps/rejected": -203.89471435546875, "loss": 2.4802, "rewards/accuracies": 0.0, "rewards/chosen": 2.419442892074585, "rewards/margins": -4.373745918273926, "rewards/rejected": 6.793188571929932, "step": 683 }, { "epoch": 0.15, "learning_rate": 9.945569802117604e-06, "logits/chosen": -1.267975926399231, "logits/rejected": -1.3869377374649048, "logps/chosen": -170.11727905273438, "logps/rejected": -399.2829895019531, "loss": 4.2313, "rewards/accuracies": 0.0, "rewards/chosen": 6.582974433898926, "rewards/margins": -6.91009521484375, "rewards/rejected": 13.493069648742676, "step": 684 }, { "epoch": 0.15, "learning_rate": 9.945305740554397e-06, "logits/chosen": -1.252110481262207, "logits/rejected": -1.0670244693756104, "logps/chosen": -124.67073059082031, "logps/rejected": -73.46653747558594, "loss": 0.0596, "rewards/accuracies": 1.0, "rewards/chosen": 5.2538347244262695, "rewards/margins": 2.279020071029663, "rewards/rejected": 2.9748146533966064, "step": 685 }, { "epoch": 0.15, "learning_rate": 9.945041043531289e-06, "logits/chosen": -1.0453507900238037, "logits/rejected": -0.9726046919822693, "logps/chosen": -61.806861877441406, "logps/rejected": -38.059112548828125, "loss": 1.1476, "rewards/accuracies": 1.0, "rewards/chosen": 2.843229055404663, "rewards/margins": 0.46972060203552246, "rewards/rejected": 2.3735084533691406, "step": 686 }, { "epoch": 0.15, "learning_rate": 9.944775711082296e-06, "logits/chosen": -1.0158698558807373, "logits/rejected": -0.9009281992912292, "logps/chosen": -48.463539123535156, "logps/rejected": -34.9376220703125, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 2.4982659816741943, "rewards/margins": 2.2932980060577393, "rewards/rejected": 0.20496788620948792, "step": 687 }, { "epoch": 0.15, "learning_rate": 9.944509743241508e-06, "logits/chosen": -0.8041713237762451, "logits/rejected": -0.7283424139022827, "logps/chosen": -37.85858154296875, "logps/rejected": -34.271366119384766, "loss": 0.5973, "rewards/accuracies": 1.0, "rewards/chosen": 2.9856889247894287, "rewards/margins": 0.9619901180267334, "rewards/rejected": 2.0236988067626953, "step": 688 }, { "epoch": 0.15, "learning_rate": 9.944243140043106e-06, "logits/chosen": -1.2547404766082764, "logits/rejected": -1.0886305570602417, "logps/chosen": -106.93284606933594, "logps/rejected": -78.87984466552734, "loss": 0.1602, "rewards/accuracies": 1.0, "rewards/chosen": 4.5623064041137695, "rewards/margins": 2.6571273803710938, "rewards/rejected": 1.9051789045333862, "step": 689 }, { "epoch": 0.15, "learning_rate": 9.943975901521347e-06, "logits/chosen": -1.2884496450424194, "logits/rejected": -1.3612446784973145, "logps/chosen": -169.71087646484375, "logps/rejected": -175.35789489746094, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 6.075819492340088, "rewards/margins": -1.0275144577026367, "rewards/rejected": 7.103333950042725, "step": 690 }, { "epoch": 0.15, "learning_rate": 9.943708027710567e-06, "logits/chosen": -0.8248670101165771, "logits/rejected": -0.6652455925941467, "logps/chosen": -30.094173431396484, "logps/rejected": -8.644280433654785, "loss": 0.1601, "rewards/accuracies": 1.0, "rewards/chosen": 2.628068208694458, "rewards/margins": 1.8604565858840942, "rewards/rejected": 0.7676116228103638, "step": 691 }, { "epoch": 0.15, "learning_rate": 9.943439518645193e-06, "logits/chosen": -1.3296890258789062, "logits/rejected": -1.2590621709823608, "logps/chosen": -140.63058471679688, "logps/rejected": -80.78787231445312, "loss": 0.9071, "rewards/accuracies": 0.0, "rewards/chosen": 4.391223430633545, "rewards/margins": -1.5431442260742188, "rewards/rejected": 5.934367656707764, "step": 692 }, { "epoch": 0.15, "learning_rate": 9.943170374359722e-06, "logits/chosen": -1.4685454368591309, "logits/rejected": -1.3618528842926025, "logps/chosen": -151.1133575439453, "logps/rejected": -48.868953704833984, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 6.943455696105957, "rewards/margins": 3.067214012145996, "rewards/rejected": 3.876241683959961, "step": 693 }, { "epoch": 0.15, "learning_rate": 9.942900594888743e-06, "logits/chosen": -1.364203691482544, "logits/rejected": -1.369724154472351, "logps/chosen": -93.11028289794922, "logps/rejected": -134.18502807617188, "loss": 1.3495, "rewards/accuracies": 0.0, "rewards/chosen": 2.0367157459259033, "rewards/margins": -0.5235092639923096, "rewards/rejected": 2.560225009918213, "step": 694 }, { "epoch": 0.15, "learning_rate": 9.94263018026692e-06, "logits/chosen": -1.1642141342163086, "logits/rejected": -1.148645043373108, "logps/chosen": -47.14295196533203, "logps/rejected": -92.08797454833984, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 3.8156464099884033, "rewards/margins": 0.347745418548584, "rewards/rejected": 3.4679009914398193, "step": 695 }, { "epoch": 0.15, "learning_rate": 9.942359130528998e-06, "logits/chosen": -1.497630000114441, "logits/rejected": -1.3483649492263794, "logps/chosen": -70.85304260253906, "logps/rejected": -24.837316513061523, "loss": 0.3731, "rewards/accuracies": 1.0, "rewards/chosen": 2.436270236968994, "rewards/margins": 1.6409881114959717, "rewards/rejected": 0.7952821850776672, "step": 696 }, { "epoch": 0.15, "learning_rate": 9.942087445709811e-06, "logits/chosen": -1.2947183847427368, "logits/rejected": -1.194696068763733, "logps/chosen": -74.0036849975586, "logps/rejected": -49.277339935302734, "loss": 0.3089, "rewards/accuracies": 1.0, "rewards/chosen": 3.9967896938323975, "rewards/margins": 0.16335105895996094, "rewards/rejected": 3.8334386348724365, "step": 697 }, { "epoch": 0.15, "learning_rate": 9.941815125844267e-06, "logits/chosen": -1.068963646888733, "logits/rejected": -0.8653996586799622, "logps/chosen": -146.28683471679688, "logps/rejected": -61.63184356689453, "loss": 0.8574, "rewards/accuracies": 1.0, "rewards/chosen": 5.757571697235107, "rewards/margins": 4.33687686920166, "rewards/rejected": 1.4206947088241577, "step": 698 }, { "epoch": 0.15, "learning_rate": 9.94154217096736e-06, "logits/chosen": -1.2400574684143066, "logits/rejected": -1.0245028734207153, "logps/chosen": -79.59258270263672, "logps/rejected": -23.726051330566406, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 5.27681827545166, "rewards/margins": 4.355127334594727, "rewards/rejected": 0.9216911196708679, "step": 699 }, { "epoch": 0.15, "learning_rate": 9.941268581114162e-06, "logits/chosen": -0.9568173885345459, "logits/rejected": -0.903459906578064, "logps/chosen": -74.34432983398438, "logps/rejected": -46.040828704833984, "loss": 1.1739, "rewards/accuracies": 1.0, "rewards/chosen": 2.523486375808716, "rewards/margins": 0.18780779838562012, "rewards/rejected": 2.3356785774230957, "step": 700 }, { "epoch": 0.16, "learning_rate": 9.94099435631983e-06, "logits/chosen": -0.9461780786514282, "logits/rejected": -0.9595702290534973, "logps/chosen": -44.31385803222656, "logps/rejected": -54.72428894042969, "loss": 0.7735, "rewards/accuracies": 0.0, "rewards/chosen": 2.1921234130859375, "rewards/margins": -1.0411584377288818, "rewards/rejected": 3.2332818508148193, "step": 701 }, { "epoch": 0.16, "learning_rate": 9.940719496619601e-06, "logits/chosen": -1.2243596315383911, "logits/rejected": -1.106747031211853, "logps/chosen": -55.85447692871094, "logps/rejected": -24.527162551879883, "loss": 1.9131, "rewards/accuracies": 1.0, "rewards/chosen": 4.299771308898926, "rewards/margins": 2.9046106338500977, "rewards/rejected": 1.3951605558395386, "step": 702 }, { "epoch": 0.16, "learning_rate": 9.940444002048794e-06, "logits/chosen": -0.9967325925827026, "logits/rejected": -0.9967325925827026, "logps/chosen": -40.37560272216797, "logps/rejected": -40.37560272216797, "loss": 0.3774, "rewards/accuracies": 0.0, "rewards/chosen": 0.9810172915458679, "rewards/margins": 0.0, "rewards/rejected": 0.9810172915458679, "step": 703 }, { "epoch": 0.16, "learning_rate": 9.94016787264281e-06, "logits/chosen": -1.3422473669052124, "logits/rejected": -1.283324956893921, "logps/chosen": -69.01963806152344, "logps/rejected": -70.92234802246094, "loss": 0.4528, "rewards/accuracies": 0.0, "rewards/chosen": 2.991230010986328, "rewards/margins": -0.38760924339294434, "rewards/rejected": 3.3788392543792725, "step": 704 }, { "epoch": 0.16, "learning_rate": 9.939891108437129e-06, "logits/chosen": -1.4818445444107056, "logits/rejected": -1.3483858108520508, "logps/chosen": -55.204673767089844, "logps/rejected": -32.26446533203125, "loss": 0.8733, "rewards/accuracies": 1.0, "rewards/chosen": 2.79441237449646, "rewards/margins": 2.247065782546997, "rewards/rejected": 0.5473465323448181, "step": 705 }, { "epoch": 0.16, "learning_rate": 9.939613709467317e-06, "logits/chosen": -1.1494486331939697, "logits/rejected": -1.1502361297607422, "logps/chosen": -67.7156982421875, "logps/rejected": -58.36157989501953, "loss": 0.9403, "rewards/accuracies": 0.0, "rewards/chosen": 3.5786819458007812, "rewards/margins": -1.6149377822875977, "rewards/rejected": 5.193619728088379, "step": 706 }, { "epoch": 0.16, "learning_rate": 9.939335675769017e-06, "logits/chosen": -1.2083951234817505, "logits/rejected": -1.1169718503952026, "logps/chosen": -82.85686492919922, "logps/rejected": -44.873565673828125, "loss": 1.578, "rewards/accuracies": 1.0, "rewards/chosen": 4.997138500213623, "rewards/margins": 3.859527111053467, "rewards/rejected": 1.1376113891601562, "step": 707 }, { "epoch": 0.16, "learning_rate": 9.939057007377955e-06, "logits/chosen": -1.2143173217773438, "logits/rejected": -1.1405044794082642, "logps/chosen": -82.87340545654297, "logps/rejected": -45.648712158203125, "loss": 0.4311, "rewards/accuracies": 0.0, "rewards/chosen": 2.098445177078247, "rewards/margins": -0.31134033203125, "rewards/rejected": 2.409785509109497, "step": 708 }, { "epoch": 0.16, "learning_rate": 9.938777704329943e-06, "logits/chosen": -1.1627947092056274, "logits/rejected": -1.1523797512054443, "logps/chosen": -99.11790466308594, "logps/rejected": -140.55441284179688, "loss": 3.4742, "rewards/accuracies": 0.0, "rewards/chosen": 2.150595188140869, "rewards/margins": -5.711850166320801, "rewards/rejected": 7.86244535446167, "step": 709 }, { "epoch": 0.16, "learning_rate": 9.938497766660869e-06, "logits/chosen": -0.8992369174957275, "logits/rejected": -0.7328376770019531, "logps/chosen": -95.40898132324219, "logps/rejected": -37.036354064941406, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 4.225230693817139, "rewards/margins": 0.6765444278717041, "rewards/rejected": 3.5486862659454346, "step": 710 }, { "epoch": 0.16, "learning_rate": 9.938217194406701e-06, "logits/chosen": -0.9810398817062378, "logits/rejected": -0.8201474547386169, "logps/chosen": -68.39737701416016, "logps/rejected": -25.563793182373047, "loss": 0.1273, "rewards/accuracies": 1.0, "rewards/chosen": 1.5914955139160156, "rewards/margins": 1.239161491394043, "rewards/rejected": 0.35233402252197266, "step": 711 }, { "epoch": 0.16, "learning_rate": 9.937935987603497e-06, "logits/chosen": -0.9491848945617676, "logits/rejected": -0.7720983624458313, "logps/chosen": -43.897361755371094, "logps/rejected": -53.32497024536133, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": 3.7277534008026123, "rewards/margins": 2.2997865676879883, "rewards/rejected": 1.4279667139053345, "step": 712 }, { "epoch": 0.16, "learning_rate": 9.937654146287388e-06, "logits/chosen": -0.8577443361282349, "logits/rejected": -0.8577443361282349, "logps/chosen": -45.024784088134766, "logps/rejected": -45.024784088134766, "loss": 1.9651, "rewards/accuracies": 0.0, "rewards/chosen": 3.1838512420654297, "rewards/margins": 0.0, "rewards/rejected": 3.1838512420654297, "step": 713 }, { "epoch": 0.16, "learning_rate": 9.937371670494591e-06, "logits/chosen": -1.498766541481018, "logits/rejected": -1.4085373878479004, "logps/chosen": -63.685508728027344, "logps/rejected": -39.79914093017578, "loss": 0.4336, "rewards/accuracies": 1.0, "rewards/chosen": 5.687348365783691, "rewards/margins": 3.0820724964141846, "rewards/rejected": 2.605275869369507, "step": 714 }, { "epoch": 0.16, "learning_rate": 9.937088560261404e-06, "logits/chosen": -1.2320964336395264, "logits/rejected": -1.144097924232483, "logps/chosen": -120.97796630859375, "logps/rejected": -80.51773071289062, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 6.925000190734863, "rewards/margins": 3.1014695167541504, "rewards/rejected": 3.823530673980713, "step": 715 }, { "epoch": 0.16, "learning_rate": 9.936804815624205e-06, "logits/chosen": -1.1197915077209473, "logits/rejected": -1.0559186935424805, "logps/chosen": -45.29258728027344, "logps/rejected": -79.5083999633789, "loss": 0.3013, "rewards/accuracies": 1.0, "rewards/chosen": 1.5357986688613892, "rewards/margins": 0.22323763370513916, "rewards/rejected": 1.31256103515625, "step": 716 }, { "epoch": 0.16, "learning_rate": 9.936520436619455e-06, "logits/chosen": -1.1680818796157837, "logits/rejected": -1.172876000404358, "logps/chosen": -77.53905487060547, "logps/rejected": -60.79069137573242, "loss": 1.11, "rewards/accuracies": 0.0, "rewards/chosen": 1.7805969715118408, "rewards/margins": -0.49745893478393555, "rewards/rejected": 2.2780559062957764, "step": 717 }, { "epoch": 0.16, "learning_rate": 9.936235423283696e-06, "logits/chosen": -1.3120262622833252, "logits/rejected": -1.0681501626968384, "logps/chosen": -117.98078918457031, "logps/rejected": -81.83302307128906, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 6.289477825164795, "rewards/margins": 3.3204615116119385, "rewards/rejected": 2.9690163135528564, "step": 718 }, { "epoch": 0.16, "learning_rate": 9.935949775653554e-06, "logits/chosen": -1.0550572872161865, "logits/rejected": -1.0550572872161865, "logps/chosen": -94.23482513427734, "logps/rejected": -94.23482513427734, "loss": 0.623, "rewards/accuracies": 0.0, "rewards/chosen": 4.0656657218933105, "rewards/margins": 0.0, "rewards/rejected": 4.0656657218933105, "step": 719 }, { "epoch": 0.16, "learning_rate": 9.935663493765726e-06, "logits/chosen": -1.2590608596801758, "logits/rejected": -1.1751645803451538, "logps/chosen": -71.76774597167969, "logps/rejected": -48.804203033447266, "loss": 0.2567, "rewards/accuracies": 1.0, "rewards/chosen": 2.260117292404175, "rewards/margins": 0.4847530126571655, "rewards/rejected": 1.7753642797470093, "step": 720 }, { "epoch": 0.16, "learning_rate": 9.935376577657008e-06, "logits/chosen": -0.9155911207199097, "logits/rejected": -0.9155911207199097, "logps/chosen": -41.370384216308594, "logps/rejected": -41.370384216308594, "loss": 2.8558, "rewards/accuracies": 0.0, "rewards/chosen": 1.857627511024475, "rewards/margins": 0.0, "rewards/rejected": 1.857627511024475, "step": 721 }, { "epoch": 0.16, "learning_rate": 9.935089027364264e-06, "logits/chosen": -1.5207781791687012, "logits/rejected": -1.5421322584152222, "logps/chosen": -81.13186645507812, "logps/rejected": -38.86201477050781, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 2.171962022781372, "rewards/margins": -0.9445013999938965, "rewards/rejected": 3.1164634227752686, "step": 722 }, { "epoch": 0.16, "learning_rate": 9.934800842924443e-06, "logits/chosen": -1.296777367591858, "logits/rejected": -1.3424537181854248, "logps/chosen": -34.28214645385742, "logps/rejected": -72.05229949951172, "loss": 1.3481, "rewards/accuracies": 1.0, "rewards/chosen": 2.168398380279541, "rewards/margins": 0.13814473152160645, "rewards/rejected": 2.0302536487579346, "step": 723 }, { "epoch": 0.16, "learning_rate": 9.934512024374577e-06, "logits/chosen": -1.51349937915802, "logits/rejected": -1.3339817523956299, "logps/chosen": -97.01091766357422, "logps/rejected": -83.77385711669922, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 5.122966766357422, "rewards/margins": 2.643044948577881, "rewards/rejected": 2.479921817779541, "step": 724 }, { "epoch": 0.16, "learning_rate": 9.934222571751777e-06, "logits/chosen": -1.1488752365112305, "logits/rejected": -1.0826878547668457, "logps/chosen": -99.82994842529297, "logps/rejected": -45.943092346191406, "loss": 0.2696, "rewards/accuracies": 1.0, "rewards/chosen": 5.192940711975098, "rewards/margins": 1.3618204593658447, "rewards/rejected": 3.831120252609253, "step": 725 }, { "epoch": 0.16, "learning_rate": 9.933932485093239e-06, "logits/chosen": -1.137186884880066, "logits/rejected": -1.1598455905914307, "logps/chosen": -93.2737045288086, "logps/rejected": -147.7953338623047, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 5.402109622955322, "rewards/margins": 0.48760128021240234, "rewards/rejected": 4.91450834274292, "step": 726 }, { "epoch": 0.16, "learning_rate": 9.933641764436237e-06, "logits/chosen": -1.3345168828964233, "logits/rejected": -1.3345168828964233, "logps/chosen": -34.578125, "logps/rejected": -34.578125, "loss": 86.5494, "rewards/accuracies": 0.0, "rewards/chosen": 2.960580587387085, "rewards/margins": 0.0, "rewards/rejected": 2.960580587387085, "step": 727 }, { "epoch": 0.16, "learning_rate": 9.933350409818128e-06, "logits/chosen": -1.0718039274215698, "logits/rejected": -0.9567539095878601, "logps/chosen": -128.10519409179688, "logps/rejected": -75.11814880371094, "loss": 0.7915, "rewards/accuracies": 1.0, "rewards/chosen": 6.981301784515381, "rewards/margins": 4.030485153198242, "rewards/rejected": 2.9508163928985596, "step": 728 }, { "epoch": 0.16, "learning_rate": 9.933058421276351e-06, "logits/chosen": -1.146175742149353, "logits/rejected": -1.0146386623382568, "logps/chosen": -65.72084045410156, "logps/rejected": -60.08900451660156, "loss": 0.8239, "rewards/accuracies": 1.0, "rewards/chosen": 2.107464551925659, "rewards/margins": 0.37871694564819336, "rewards/rejected": 1.7287476062774658, "step": 729 }, { "epoch": 0.16, "learning_rate": 9.932765798848428e-06, "logits/chosen": -0.8602401614189148, "logits/rejected": -0.7347365021705627, "logps/chosen": -41.9371337890625, "logps/rejected": -24.135364532470703, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 3.0601747035980225, "rewards/margins": 2.1913983821868896, "rewards/rejected": 0.8687763214111328, "step": 730 }, { "epoch": 0.16, "learning_rate": 9.932472542571954e-06, "logits/chosen": -1.5471763610839844, "logits/rejected": -1.274757981300354, "logps/chosen": -149.5009765625, "logps/rejected": -81.95803833007812, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": 8.384332656860352, "rewards/margins": 5.426878929138184, "rewards/rejected": 2.957453966140747, "step": 731 }, { "epoch": 0.16, "learning_rate": 9.932178652484617e-06, "logits/chosen": -0.7580341696739197, "logits/rejected": -0.7580341696739197, "logps/chosen": -56.36252975463867, "logps/rejected": -56.36252975463867, "loss": 0.3924, "rewards/accuracies": 0.0, "rewards/chosen": 1.3160053491592407, "rewards/margins": 0.0, "rewards/rejected": 1.3160053491592407, "step": 732 }, { "epoch": 0.16, "learning_rate": 9.931884128624181e-06, "logits/chosen": -1.0464545488357544, "logits/rejected": -0.9474722743034363, "logps/chosen": -67.46989440917969, "logps/rejected": -21.084941864013672, "loss": 0.1121, "rewards/accuracies": 1.0, "rewards/chosen": 3.326242208480835, "rewards/margins": 1.5871132612228394, "rewards/rejected": 1.7391289472579956, "step": 733 }, { "epoch": 0.16, "learning_rate": 9.93158897102849e-06, "logits/chosen": -1.3752521276474, "logits/rejected": -1.1558870077133179, "logps/chosen": -197.7271728515625, "logps/rejected": -96.6595458984375, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": 6.73486328125, "rewards/margins": 4.7528076171875, "rewards/rejected": 1.9820556640625, "step": 734 }, { "epoch": 0.16, "learning_rate": 9.93129317973547e-06, "logits/chosen": -1.1441009044647217, "logits/rejected": -1.0509371757507324, "logps/chosen": -85.95225524902344, "logps/rejected": -133.3974609375, "loss": 0.2028, "rewards/accuracies": 1.0, "rewards/chosen": 5.35540771484375, "rewards/margins": 0.8567471504211426, "rewards/rejected": 4.498660564422607, "step": 735 }, { "epoch": 0.16, "learning_rate": 9.930996754783134e-06, "logits/chosen": -0.7305923104286194, "logits/rejected": -0.6428866982460022, "logps/chosen": -75.48172760009766, "logps/rejected": -79.30235290527344, "loss": 1.9366, "rewards/accuracies": 0.0, "rewards/chosen": 2.7273125648498535, "rewards/margins": -0.21298909187316895, "rewards/rejected": 2.9403016567230225, "step": 736 }, { "epoch": 0.16, "learning_rate": 9.930699696209566e-06, "logits/chosen": -2.368659019470215, "logits/rejected": -2.295947313308716, "logps/chosen": -55.15325164794922, "logps/rejected": -98.28488159179688, "loss": 0.7054, "rewards/accuracies": 0.0, "rewards/chosen": 1.655286431312561, "rewards/margins": -0.3916991949081421, "rewards/rejected": 2.046985626220703, "step": 737 }, { "epoch": 0.16, "learning_rate": 9.93040200405294e-06, "logits/chosen": -0.9048824310302734, "logits/rejected": -0.9162764549255371, "logps/chosen": -63.44672393798828, "logps/rejected": -72.497314453125, "loss": 1.0066, "rewards/accuracies": 0.0, "rewards/chosen": 2.2553412914276123, "rewards/margins": -1.701998233795166, "rewards/rejected": 3.9573395252227783, "step": 738 }, { "epoch": 0.16, "learning_rate": 9.930103678351511e-06, "logits/chosen": -1.3231241703033447, "logits/rejected": -1.2755868434906006, "logps/chosen": -25.454050064086914, "logps/rejected": -35.874786376953125, "loss": 0.1438, "rewards/accuracies": 1.0, "rewards/chosen": 2.827528953552246, "rewards/margins": 1.1391408443450928, "rewards/rejected": 1.6883881092071533, "step": 739 }, { "epoch": 0.16, "learning_rate": 9.92980471914361e-06, "logits/chosen": -1.224624752998352, "logits/rejected": -1.1610828638076782, "logps/chosen": -72.4240951538086, "logps/rejected": -90.46488952636719, "loss": 0.5343, "rewards/accuracies": 1.0, "rewards/chosen": 3.1519858837127686, "rewards/margins": 0.5443549156188965, "rewards/rejected": 2.607630968093872, "step": 740 }, { "epoch": 0.16, "learning_rate": 9.929505126467653e-06, "logits/chosen": -1.009894609451294, "logits/rejected": -0.9306454658508301, "logps/chosen": -50.739681243896484, "logps/rejected": -93.00096130371094, "loss": 1.5968, "rewards/accuracies": 1.0, "rewards/chosen": 2.8401753902435303, "rewards/margins": 0.9269551038742065, "rewards/rejected": 1.9132202863693237, "step": 741 }, { "epoch": 0.16, "learning_rate": 9.929204900362137e-06, "logits/chosen": -0.8479524254798889, "logits/rejected": -0.8486993312835693, "logps/chosen": -41.98844909667969, "logps/rejected": -46.944942474365234, "loss": 0.3696, "rewards/accuracies": 0.0, "rewards/chosen": 1.4237651824951172, "rewards/margins": -0.013797402381896973, "rewards/rejected": 1.4375625848770142, "step": 742 }, { "epoch": 0.16, "learning_rate": 9.928904040865642e-06, "logits/chosen": -1.0988459587097168, "logits/rejected": -0.999920666217804, "logps/chosen": -80.10035705566406, "logps/rejected": -50.69504165649414, "loss": 2.0363, "rewards/accuracies": 0.0, "rewards/chosen": 2.9163575172424316, "rewards/margins": -0.5353527069091797, "rewards/rejected": 3.4517102241516113, "step": 743 }, { "epoch": 0.16, "learning_rate": 9.928602548016826e-06, "logits/chosen": -1.1630678176879883, "logits/rejected": -1.2298120260238647, "logps/chosen": -60.925010681152344, "logps/rejected": -113.72576904296875, "loss": 1.4779, "rewards/accuracies": 0.0, "rewards/chosen": 2.8344483375549316, "rewards/margins": -2.876063346862793, "rewards/rejected": 5.710511684417725, "step": 744 }, { "epoch": 0.16, "learning_rate": 9.92830042185443e-06, "logits/chosen": -0.8387921452522278, "logits/rejected": -0.6853896975517273, "logps/chosen": -43.5110969543457, "logps/rejected": -27.564697265625, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 3.697448492050171, "rewards/margins": 3.554431676864624, "rewards/rejected": 0.14301681518554688, "step": 745 }, { "epoch": 0.17, "learning_rate": 9.927997662417277e-06, "logits/chosen": -1.1702086925506592, "logits/rejected": -1.007670521736145, "logps/chosen": -97.68426513671875, "logps/rejected": -84.76054382324219, "loss": 0.3652, "rewards/accuracies": 1.0, "rewards/chosen": 7.234976291656494, "rewards/margins": 3.2728638648986816, "rewards/rejected": 3.9621124267578125, "step": 746 }, { "epoch": 0.17, "learning_rate": 9.927694269744273e-06, "logits/chosen": -0.7951864004135132, "logits/rejected": -0.7951864004135132, "logps/chosen": -10.37332534790039, "logps/rejected": -10.37332534790039, "loss": 0.3485, "rewards/accuracies": 0.0, "rewards/chosen": 0.6139684915542603, "rewards/margins": 0.0, "rewards/rejected": 0.6139684915542603, "step": 747 }, { "epoch": 0.17, "learning_rate": 9.9273902438744e-06, "logits/chosen": -0.745244562625885, "logits/rejected": -0.9758890867233276, "logps/chosen": -56.44536590576172, "logps/rejected": -132.9433135986328, "loss": 1.5856, "rewards/accuracies": 0.0, "rewards/chosen": 3.085211992263794, "rewards/margins": -1.714519739151001, "rewards/rejected": 4.799731731414795, "step": 748 }, { "epoch": 0.17, "learning_rate": 9.927085584846725e-06, "logits/chosen": -1.2058069705963135, "logits/rejected": -1.1577540636062622, "logps/chosen": -107.71050262451172, "logps/rejected": -99.10425567626953, "loss": 2.1651, "rewards/accuracies": 0.0, "rewards/chosen": 3.342792510986328, "rewards/margins": -2.1892900466918945, "rewards/rejected": 5.532082557678223, "step": 749 }, { "epoch": 0.17, "learning_rate": 9.926780292700397e-06, "logits/chosen": -0.985777735710144, "logits/rejected": -0.8419991731643677, "logps/chosen": -40.5443115234375, "logps/rejected": -42.4002685546875, "loss": 0.3716, "rewards/accuracies": 0.0, "rewards/chosen": 2.845759630203247, "rewards/margins": -0.09193801879882812, "rewards/rejected": 2.937697649002075, "step": 750 }, { "epoch": 0.17, "learning_rate": 9.926474367474646e-06, "logits/chosen": -0.6105539202690125, "logits/rejected": -0.5372821688652039, "logps/chosen": -45.696258544921875, "logps/rejected": -62.61764144897461, "loss": 0.1267, "rewards/accuracies": 1.0, "rewards/chosen": 2.734813690185547, "rewards/margins": 1.2840381860733032, "rewards/rejected": 1.4507755041122437, "step": 751 }, { "epoch": 0.17, "learning_rate": 9.92616780920878e-06, "logits/chosen": -0.8525936603546143, "logits/rejected": -0.6139631271362305, "logps/chosen": -97.3034439086914, "logps/rejected": -12.0249605178833, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 7.518874645233154, "rewards/margins": 7.149924278259277, "rewards/rejected": 0.3689502775669098, "step": 752 }, { "epoch": 0.17, "learning_rate": 9.925860617942195e-06, "logits/chosen": -1.0897849798202515, "logits/rejected": -1.0397450923919678, "logps/chosen": -76.6004867553711, "logps/rejected": -55.79871368408203, "loss": 0.508, "rewards/accuracies": 1.0, "rewards/chosen": 2.9075965881347656, "rewards/margins": 0.3744337558746338, "rewards/rejected": 2.533162832260132, "step": 753 }, { "epoch": 0.17, "learning_rate": 9.92555279371436e-06, "logits/chosen": -0.8291316628456116, "logits/rejected": -0.7464751601219177, "logps/chosen": -46.01941680908203, "logps/rejected": -28.445730209350586, "loss": 0.6019, "rewards/accuracies": 0.0, "rewards/chosen": 0.8338791131973267, "rewards/margins": -0.7854471206665039, "rewards/rejected": 1.6193262338638306, "step": 754 }, { "epoch": 0.17, "learning_rate": 9.925244336564831e-06, "logits/chosen": -1.1177325248718262, "logits/rejected": -1.1284593343734741, "logps/chosen": -148.0433349609375, "logps/rejected": -94.15401458740234, "loss": 1.0201, "rewards/accuracies": 0.0, "rewards/chosen": 5.5499420166015625, "rewards/margins": -0.5597848892211914, "rewards/rejected": 6.109726905822754, "step": 755 }, { "epoch": 0.17, "learning_rate": 9.924935246533249e-06, "logits/chosen": -0.9524804949760437, "logits/rejected": -1.0501869916915894, "logps/chosen": -59.24898910522461, "logps/rejected": -85.89143371582031, "loss": 0.3524, "rewards/accuracies": 1.0, "rewards/chosen": 2.831761598587036, "rewards/margins": 0.11010026931762695, "rewards/rejected": 2.721661329269409, "step": 756 }, { "epoch": 0.17, "learning_rate": 9.924625523659324e-06, "logits/chosen": -1.442804217338562, "logits/rejected": -1.3394404649734497, "logps/chosen": -49.821590423583984, "logps/rejected": -35.98875045776367, "loss": 0.5352, "rewards/accuracies": 0.0, "rewards/chosen": 2.808384418487549, "rewards/margins": -0.011469125747680664, "rewards/rejected": 2.8198535442352295, "step": 757 }, { "epoch": 0.17, "learning_rate": 9.924315167982858e-06, "logits/chosen": -1.073683261871338, "logits/rejected": -1.1050567626953125, "logps/chosen": -84.1148452758789, "logps/rejected": -60.146217346191406, "loss": 2.0305, "rewards/accuracies": 0.0, "rewards/chosen": 2.228102922439575, "rewards/margins": -3.1219255924224854, "rewards/rejected": 5.3500285148620605, "step": 758 }, { "epoch": 0.17, "learning_rate": 9.924004179543728e-06, "logits/chosen": -1.3428764343261719, "logits/rejected": -1.3045531511306763, "logps/chosen": -122.93916320800781, "logps/rejected": -122.40715026855469, "loss": 0.6031, "rewards/accuracies": 0.0, "rewards/chosen": 6.863409519195557, "rewards/margins": -0.8500199317932129, "rewards/rejected": 7.7134294509887695, "step": 759 }, { "epoch": 0.17, "learning_rate": 9.923692558381902e-06, "logits/chosen": -1.1026722192764282, "logits/rejected": -1.1026722192764282, "logps/chosen": -44.087615966796875, "logps/rejected": -44.087615966796875, "loss": 0.3616, "rewards/accuracies": 0.0, "rewards/chosen": 3.0352249145507812, "rewards/margins": 0.0, "rewards/rejected": 3.0352249145507812, "step": 760 }, { "epoch": 0.17, "learning_rate": 9.923380304537417e-06, "logits/chosen": -0.8433088064193726, "logits/rejected": -0.7866665720939636, "logps/chosen": -61.63124465942383, "logps/rejected": -64.59274291992188, "loss": 2.6054, "rewards/accuracies": 0.0, "rewards/chosen": 1.9323253631591797, "rewards/margins": -0.902207612991333, "rewards/rejected": 2.8345329761505127, "step": 761 }, { "epoch": 0.17, "learning_rate": 9.923067418050399e-06, "logits/chosen": -1.2858529090881348, "logits/rejected": -1.003495693206787, "logps/chosen": -110.02825927734375, "logps/rejected": -28.39944839477539, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 6.410729885101318, "rewards/margins": 5.6782121658325195, "rewards/rejected": 0.732517659664154, "step": 762 }, { "epoch": 0.17, "learning_rate": 9.922753898961052e-06, "logits/chosen": -1.116134524345398, "logits/rejected": -1.116134524345398, "logps/chosen": -20.00067138671875, "logps/rejected": -20.00067138671875, "loss": 0.5207, "rewards/accuracies": 0.0, "rewards/chosen": 1.79192054271698, "rewards/margins": 0.0, "rewards/rejected": 1.79192054271698, "step": 763 }, { "epoch": 0.17, "learning_rate": 9.922439747309663e-06, "logits/chosen": -1.3343673944473267, "logits/rejected": -1.172287106513977, "logps/chosen": -111.463134765625, "logps/rejected": -68.68254089355469, "loss": 0.197, "rewards/accuracies": 1.0, "rewards/chosen": 5.1022047996521, "rewards/margins": 2.292397975921631, "rewards/rejected": 2.8098068237304688, "step": 764 }, { "epoch": 0.17, "learning_rate": 9.922124963136599e-06, "logits/chosen": -1.0204075574874878, "logits/rejected": -0.9710829854011536, "logps/chosen": -21.61933708190918, "logps/rejected": -42.07854461669922, "loss": 0.5738, "rewards/accuracies": 1.0, "rewards/chosen": 2.2898473739624023, "rewards/margins": 0.3901025056838989, "rewards/rejected": 1.8997448682785034, "step": 765 }, { "epoch": 0.17, "learning_rate": 9.92180954648231e-06, "logits/chosen": -1.0337278842926025, "logits/rejected": -0.6110800504684448, "logps/chosen": -45.45208740234375, "logps/rejected": -89.849609375, "loss": 0.4527, "rewards/accuracies": 0.0, "rewards/chosen": 1.8656891584396362, "rewards/margins": -0.34321749210357666, "rewards/rejected": 2.208906650543213, "step": 766 }, { "epoch": 0.17, "learning_rate": 9.921493497387327e-06, "logits/chosen": -0.9188073873519897, "logits/rejected": -0.8402132987976074, "logps/chosen": -53.68244934082031, "logps/rejected": -43.08458709716797, "loss": 0.5847, "rewards/accuracies": 0.0, "rewards/chosen": 2.035902500152588, "rewards/margins": -0.47310781478881836, "rewards/rejected": 2.5090103149414062, "step": 767 }, { "epoch": 0.17, "learning_rate": 9.921176815892259e-06, "logits/chosen": -1.0948457717895508, "logits/rejected": -1.0704280138015747, "logps/chosen": -174.31741333007812, "logps/rejected": -104.28430938720703, "loss": 0.604, "rewards/accuracies": 1.0, "rewards/chosen": 6.497653484344482, "rewards/margins": 0.27863121032714844, "rewards/rejected": 6.219022274017334, "step": 768 }, { "epoch": 0.17, "learning_rate": 9.920859502037801e-06, "logits/chosen": -1.1892355680465698, "logits/rejected": -1.2059012651443481, "logps/chosen": -98.45512390136719, "logps/rejected": -110.58294677734375, "loss": 0.8534, "rewards/accuracies": 0.0, "rewards/chosen": 6.010112285614014, "rewards/margins": -1.504237174987793, "rewards/rejected": 7.514349460601807, "step": 769 }, { "epoch": 0.17, "learning_rate": 9.920541555864726e-06, "logits/chosen": -0.8821549415588379, "logits/rejected": -0.534591555595398, "logps/chosen": -117.78339385986328, "logps/rejected": -41.89849853515625, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 4.798827648162842, "rewards/margins": 4.025387763977051, "rewards/rejected": 0.7734398245811462, "step": 770 }, { "epoch": 0.17, "learning_rate": 9.920222977413892e-06, "logits/chosen": -1.1588042974472046, "logits/rejected": -1.046130895614624, "logps/chosen": -35.530574798583984, "logps/rejected": -52.42012023925781, "loss": 0.5174, "rewards/accuracies": 1.0, "rewards/chosen": 1.3628383874893188, "rewards/margins": 0.6006160974502563, "rewards/rejected": 0.7622222900390625, "step": 771 }, { "epoch": 0.17, "learning_rate": 9.919903766726229e-06, "logits/chosen": -1.3523523807525635, "logits/rejected": -1.2498401403427124, "logps/chosen": -41.03915023803711, "logps/rejected": -20.878692626953125, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 2.78678560256958, "rewards/margins": 2.576711654663086, "rewards/rejected": 0.2100740522146225, "step": 772 }, { "epoch": 0.17, "learning_rate": 9.919583923842763e-06, "logits/chosen": -1.0269219875335693, "logits/rejected": -1.0150271654129028, "logps/chosen": -49.8466796875, "logps/rejected": -97.10944366455078, "loss": 2.9786, "rewards/accuracies": 0.0, "rewards/chosen": 3.137716770172119, "rewards/margins": -1.163680076599121, "rewards/rejected": 4.30139684677124, "step": 773 }, { "epoch": 0.17, "learning_rate": 9.919263448804589e-06, "logits/chosen": -1.3470710515975952, "logits/rejected": -1.4275370836257935, "logps/chosen": -56.466793060302734, "logps/rejected": -78.45276641845703, "loss": 2.6642, "rewards/accuracies": 0.0, "rewards/chosen": 2.1291096210479736, "rewards/margins": -5.114630699157715, "rewards/rejected": 7.243740081787109, "step": 774 }, { "epoch": 0.17, "learning_rate": 9.918942341652885e-06, "logits/chosen": -1.2848857641220093, "logits/rejected": -1.1026993989944458, "logps/chosen": -103.14876556396484, "logps/rejected": -75.41535949707031, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 4.951505184173584, "rewards/margins": 2.6182851791381836, "rewards/rejected": 2.3332200050354004, "step": 775 }, { "epoch": 0.17, "learning_rate": 9.918620602428916e-06, "logits/chosen": -0.9804434180259705, "logits/rejected": -0.8955866098403931, "logps/chosen": -60.947021484375, "logps/rejected": -49.598899841308594, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": 1.284382700920105, "rewards/margins": -1.1634727716445923, "rewards/rejected": 2.4478554725646973, "step": 776 }, { "epoch": 0.17, "learning_rate": 9.918298231174023e-06, "logits/chosen": -0.9184123873710632, "logits/rejected": -0.7979583144187927, "logps/chosen": -54.31275939941406, "logps/rejected": -50.296531677246094, "loss": 0.1531, "rewards/accuracies": 1.0, "rewards/chosen": 3.453259229660034, "rewards/margins": 1.1307494640350342, "rewards/rejected": 2.322509765625, "step": 777 }, { "epoch": 0.17, "learning_rate": 9.917975227929631e-06, "logits/chosen": -1.1274468898773193, "logits/rejected": -1.2239434719085693, "logps/chosen": -92.88951873779297, "logps/rejected": -108.5950927734375, "loss": 1.0873, "rewards/accuracies": 0.0, "rewards/chosen": 4.684639930725098, "rewards/margins": -2.009091854095459, "rewards/rejected": 6.693731784820557, "step": 778 }, { "epoch": 0.17, "learning_rate": 9.917651592737245e-06, "logits/chosen": -0.8404490351676941, "logits/rejected": -0.8132061958312988, "logps/chosen": -69.87346649169922, "logps/rejected": -75.58377075195312, "loss": 2.0135, "rewards/accuracies": 0.0, "rewards/chosen": 2.266944169998169, "rewards/margins": -1.703650712966919, "rewards/rejected": 3.970594882965088, "step": 779 }, { "epoch": 0.17, "learning_rate": 9.91732732563845e-06, "logits/chosen": -1.2274062633514404, "logits/rejected": -1.1891573667526245, "logps/chosen": -39.69237518310547, "logps/rejected": -44.81759262084961, "loss": 0.4513, "rewards/accuracies": 1.0, "rewards/chosen": 2.7613778114318848, "rewards/margins": 1.339959740638733, "rewards/rejected": 1.4214180707931519, "step": 780 }, { "epoch": 0.17, "learning_rate": 9.917002426674916e-06, "logits/chosen": -0.941540539264679, "logits/rejected": -1.1551685333251953, "logps/chosen": -47.89369201660156, "logps/rejected": -107.4327163696289, "loss": 2.6576, "rewards/accuracies": 0.0, "rewards/chosen": 2.409796953201294, "rewards/margins": -5.220486640930176, "rewards/rejected": 7.630283355712891, "step": 781 }, { "epoch": 0.17, "learning_rate": 9.91667689588839e-06, "logits/chosen": -0.9082678556442261, "logits/rejected": -0.8147470951080322, "logps/chosen": -110.64958190917969, "logps/rejected": -98.43806457519531, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": 9.054205894470215, "rewards/margins": 3.474349021911621, "rewards/rejected": 5.579856872558594, "step": 782 }, { "epoch": 0.17, "learning_rate": 9.916350733320704e-06, "logits/chosen": -0.927085280418396, "logits/rejected": -0.9384278655052185, "logps/chosen": -29.206575393676758, "logps/rejected": -32.4793815612793, "loss": 0.5698, "rewards/accuracies": 0.0, "rewards/chosen": 2.8021795749664307, "rewards/margins": -0.06747770309448242, "rewards/rejected": 2.869657278060913, "step": 783 }, { "epoch": 0.17, "learning_rate": 9.916023939013764e-06, "logits/chosen": -0.9375673532485962, "logits/rejected": -0.9894335865974426, "logps/chosen": -76.12901306152344, "logps/rejected": -36.50364685058594, "loss": 0.6113, "rewards/accuracies": 0.0, "rewards/chosen": 3.782006025314331, "rewards/margins": -0.8520228862762451, "rewards/rejected": 4.634028911590576, "step": 784 }, { "epoch": 0.17, "learning_rate": 9.915696513009567e-06, "logits/chosen": -1.137076735496521, "logits/rejected": -1.0374356508255005, "logps/chosen": -44.460906982421875, "logps/rejected": -51.2581787109375, "loss": 0.7157, "rewards/accuracies": 0.0, "rewards/chosen": 2.0781311988830566, "rewards/margins": -1.116492509841919, "rewards/rejected": 3.1946237087249756, "step": 785 }, { "epoch": 0.17, "learning_rate": 9.915368455350185e-06, "logits/chosen": -0.8432281017303467, "logits/rejected": -0.8172407150268555, "logps/chosen": -47.42876434326172, "logps/rejected": -52.91933822631836, "loss": 0.4533, "rewards/accuracies": 0.0, "rewards/chosen": 2.819988250732422, "rewards/margins": -0.3871731758117676, "rewards/rejected": 3.2071614265441895, "step": 786 }, { "epoch": 0.17, "learning_rate": 9.915039766077772e-06, "logits/chosen": -1.3400704860687256, "logits/rejected": -1.290535807609558, "logps/chosen": -139.26426696777344, "logps/rejected": -139.1394805908203, "loss": 0.7163, "rewards/accuracies": 0.0, "rewards/chosen": 4.681382656097412, "rewards/margins": -1.1009430885314941, "rewards/rejected": 5.782325744628906, "step": 787 }, { "epoch": 0.17, "learning_rate": 9.914710445234567e-06, "logits/chosen": -0.6593317985534668, "logits/rejected": -0.48088493943214417, "logps/chosen": -63.64142608642578, "logps/rejected": -26.72943687438965, "loss": 0.5916, "rewards/accuracies": 1.0, "rewards/chosen": 5.611375331878662, "rewards/margins": 4.017287731170654, "rewards/rejected": 1.5940874814987183, "step": 788 }, { "epoch": 0.17, "learning_rate": 9.914380492862883e-06, "logits/chosen": -1.0883530378341675, "logits/rejected": -1.0246524810791016, "logps/chosen": -42.277366638183594, "logps/rejected": -10.703709602355957, "loss": 1.7128, "rewards/accuracies": 1.0, "rewards/chosen": 2.49897837638855, "rewards/margins": 1.0846184492111206, "rewards/rejected": 1.4143599271774292, "step": 789 }, { "epoch": 0.17, "learning_rate": 9.91404990900512e-06, "logits/chosen": -1.1685353517532349, "logits/rejected": -1.2092639207839966, "logps/chosen": -46.646270751953125, "logps/rejected": -61.242881774902344, "loss": 1.9946, "rewards/accuracies": 0.0, "rewards/chosen": 3.260301351547241, "rewards/margins": -1.0580079555511475, "rewards/rejected": 4.318309307098389, "step": 790 }, { "epoch": 0.18, "learning_rate": 9.913718693703755e-06, "logits/chosen": -0.9893348217010498, "logits/rejected": -1.0466228723526, "logps/chosen": -42.441951751708984, "logps/rejected": -114.43196105957031, "loss": 1.233, "rewards/accuracies": 0.0, "rewards/chosen": 3.4626851081848145, "rewards/margins": -2.2856626510620117, "rewards/rejected": 5.748347759246826, "step": 791 }, { "epoch": 0.18, "learning_rate": 9.91338684700135e-06, "logits/chosen": -0.8529520034790039, "logits/rejected": -0.8803690671920776, "logps/chosen": -83.36311340332031, "logps/rejected": -31.647666931152344, "loss": 1.0936, "rewards/accuracies": 0.0, "rewards/chosen": 3.0583603382110596, "rewards/margins": -0.9860236644744873, "rewards/rejected": 4.044384002685547, "step": 792 }, { "epoch": 0.18, "learning_rate": 9.91305436894055e-06, "logits/chosen": -1.0538142919540405, "logits/rejected": -0.9923610687255859, "logps/chosen": -44.377296447753906, "logps/rejected": -70.7072982788086, "loss": 1.5677, "rewards/accuracies": 1.0, "rewards/chosen": 2.9111123085021973, "rewards/margins": 0.1670396327972412, "rewards/rejected": 2.744072675704956, "step": 793 }, { "epoch": 0.18, "learning_rate": 9.912721259564072e-06, "logits/chosen": -0.7674641609191895, "logits/rejected": -0.5956028699874878, "logps/chosen": -86.76667785644531, "logps/rejected": -43.0772705078125, "loss": 0.1992, "rewards/accuracies": 1.0, "rewards/chosen": 4.811026096343994, "rewards/margins": 2.6199951171875, "rewards/rejected": 2.191030979156494, "step": 794 }, { "epoch": 0.18, "learning_rate": 9.91238751891472e-06, "logits/chosen": -1.0859140157699585, "logits/rejected": -1.1211414337158203, "logps/chosen": -76.70657348632812, "logps/rejected": -90.6080322265625, "loss": 1.2473, "rewards/accuracies": 0.0, "rewards/chosen": 1.893513560295105, "rewards/margins": -2.224470615386963, "rewards/rejected": 4.117984294891357, "step": 795 }, { "epoch": 0.18, "learning_rate": 9.912053147035383e-06, "logits/chosen": -1.081286072731018, "logits/rejected": -0.9750898480415344, "logps/chosen": -127.0251693725586, "logps/rejected": -84.72887420654297, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 6.140763282775879, "rewards/margins": 2.823683261871338, "rewards/rejected": 3.317080020904541, "step": 796 }, { "epoch": 0.18, "learning_rate": 9.911718143969024e-06, "logits/chosen": -1.5835845470428467, "logits/rejected": -1.380234956741333, "logps/chosen": -136.04458618164062, "logps/rejected": -85.0711669921875, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 6.833726406097412, "rewards/margins": 2.8327131271362305, "rewards/rejected": 4.001013278961182, "step": 797 }, { "epoch": 0.18, "learning_rate": 9.911382509758692e-06, "logits/chosen": -1.1962212324142456, "logits/rejected": -0.9706411957740784, "logps/chosen": -111.6672592163086, "logps/rejected": -33.96778869628906, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 6.412210941314697, "rewards/margins": 6.036309242248535, "rewards/rejected": 0.3759017884731293, "step": 798 }, { "epoch": 0.18, "learning_rate": 9.911046244447515e-06, "logits/chosen": -1.2341639995574951, "logits/rejected": -1.174668312072754, "logps/chosen": -72.15599060058594, "logps/rejected": -75.18940734863281, "loss": 0.9763, "rewards/accuracies": 0.0, "rewards/chosen": 2.3189315795898438, "rewards/margins": -1.755988597869873, "rewards/rejected": 4.074920177459717, "step": 799 }, { "epoch": 0.18, "learning_rate": 9.910709348078699e-06, "logits/chosen": -1.0335429906845093, "logits/rejected": -1.0594985485076904, "logps/chosen": -55.78291320800781, "logps/rejected": -37.954559326171875, "loss": 0.4879, "rewards/accuracies": 0.0, "rewards/chosen": 2.1200103759765625, "rewards/margins": -0.4656808376312256, "rewards/rejected": 2.585691213607788, "step": 800 }, { "epoch": 0.18, "learning_rate": 9.910371820695538e-06, "logits/chosen": -0.9402267336845398, "logits/rejected": -0.9088319540023804, "logps/chosen": -56.1148567199707, "logps/rejected": -61.8338737487793, "loss": 1.3793, "rewards/accuracies": 0.0, "rewards/chosen": 2.4649417400360107, "rewards/margins": -1.0883913040161133, "rewards/rejected": 3.553333044052124, "step": 801 }, { "epoch": 0.18, "learning_rate": 9.910033662341403e-06, "logits/chosen": -0.8825464844703674, "logits/rejected": -0.8863517045974731, "logps/chosen": -39.9671630859375, "logps/rejected": -40.50697708129883, "loss": 0.4748, "rewards/accuracies": 1.0, "rewards/chosen": 3.811279296875, "rewards/margins": 0.10611748695373535, "rewards/rejected": 3.7051618099212646, "step": 802 }, { "epoch": 0.18, "learning_rate": 9.909694873059745e-06, "logits/chosen": -1.2796801328659058, "logits/rejected": -1.2058866024017334, "logps/chosen": -113.21249389648438, "logps/rejected": -53.805328369140625, "loss": 0.3566, "rewards/accuracies": 1.0, "rewards/chosen": 5.982226848602295, "rewards/margins": 3.9020533561706543, "rewards/rejected": 2.0801734924316406, "step": 803 }, { "epoch": 0.18, "learning_rate": 9.909355452894098e-06, "logits/chosen": -1.1372157335281372, "logits/rejected": -0.9856467247009277, "logps/chosen": -78.99615478515625, "logps/rejected": -118.157470703125, "loss": 0.2912, "rewards/accuracies": 1.0, "rewards/chosen": 4.352087497711182, "rewards/margins": 0.24945974349975586, "rewards/rejected": 4.102627754211426, "step": 804 }, { "epoch": 0.18, "learning_rate": 9.909015401888077e-06, "logits/chosen": -0.9119741320610046, "logits/rejected": -0.9119741320610046, "logps/chosen": -41.146236419677734, "logps/rejected": -41.146236419677734, "loss": 1.5687, "rewards/accuracies": 0.0, "rewards/chosen": 0.6296192407608032, "rewards/margins": 0.0, "rewards/rejected": 0.6296192407608032, "step": 805 }, { "epoch": 0.18, "learning_rate": 9.908674720085378e-06, "logits/chosen": -1.115628719329834, "logits/rejected": -1.0817883014678955, "logps/chosen": -51.810794830322266, "logps/rejected": -61.9259147644043, "loss": 0.5311, "rewards/accuracies": 0.0, "rewards/chosen": 3.4078869819641113, "rewards/margins": -0.601315975189209, "rewards/rejected": 4.00920295715332, "step": 806 }, { "epoch": 0.18, "learning_rate": 9.908333407529779e-06, "logits/chosen": -1.0971860885620117, "logits/rejected": -0.9929226040840149, "logps/chosen": -30.174955368041992, "logps/rejected": -54.488101959228516, "loss": 1.0982, "rewards/accuracies": 1.0, "rewards/chosen": 2.388897180557251, "rewards/margins": 0.26213598251342773, "rewards/rejected": 2.1267611980438232, "step": 807 }, { "epoch": 0.18, "learning_rate": 9.907991464265136e-06, "logits/chosen": -1.434659481048584, "logits/rejected": -1.3587327003479004, "logps/chosen": -112.97744750976562, "logps/rejected": -111.28132629394531, "loss": 0.9006, "rewards/accuracies": 0.0, "rewards/chosen": 6.710402011871338, "rewards/margins": -0.6916899681091309, "rewards/rejected": 7.402091979980469, "step": 808 }, { "epoch": 0.18, "learning_rate": 9.907648890335387e-06, "logits/chosen": -1.3112529516220093, "logits/rejected": -1.1772359609603882, "logps/chosen": -67.07899475097656, "logps/rejected": -53.89457702636719, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 2.8135299682617188, "rewards/margins": 1.5456489324569702, "rewards/rejected": 1.2678810358047485, "step": 809 }, { "epoch": 0.18, "learning_rate": 9.907305685784553e-06, "logits/chosen": -1.1039601564407349, "logits/rejected": -1.009089469909668, "logps/chosen": -53.1840705871582, "logps/rejected": -50.60285186767578, "loss": 0.2645, "rewards/accuracies": 1.0, "rewards/chosen": 3.054948091506958, "rewards/margins": 0.3728008270263672, "rewards/rejected": 2.682147264480591, "step": 810 }, { "epoch": 0.18, "learning_rate": 9.906961850656737e-06, "logits/chosen": -1.214077353477478, "logits/rejected": -0.8908031582832336, "logps/chosen": -109.87080383300781, "logps/rejected": -52.11023712158203, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": 6.107844829559326, "rewards/margins": 4.380435943603516, "rewards/rejected": 1.7274086475372314, "step": 811 }, { "epoch": 0.18, "learning_rate": 9.906617384996118e-06, "logits/chosen": -1.0357849597930908, "logits/rejected": -0.9542733430862427, "logps/chosen": -97.78388977050781, "logps/rejected": -70.93548583984375, "loss": 0.3191, "rewards/accuracies": 1.0, "rewards/chosen": 4.226904392242432, "rewards/margins": 0.15236949920654297, "rewards/rejected": 4.074534893035889, "step": 812 }, { "epoch": 0.18, "learning_rate": 9.906272288846962e-06, "logits/chosen": -1.2850533723831177, "logits/rejected": -1.2963571548461914, "logps/chosen": -91.40299224853516, "logps/rejected": -69.17330932617188, "loss": 1.2298, "rewards/accuracies": 0.0, "rewards/chosen": 0.7210411429405212, "rewards/margins": -1.915076494216919, "rewards/rejected": 2.636117696762085, "step": 813 }, { "epoch": 0.18, "learning_rate": 9.90592656225361e-06, "logits/chosen": -1.0799516439437866, "logits/rejected": -1.0799516439437866, "logps/chosen": -71.06942749023438, "logps/rejected": -71.06942749023438, "loss": 0.506, "rewards/accuracies": 0.0, "rewards/chosen": 2.4812142848968506, "rewards/margins": 0.0, "rewards/rejected": 2.4812142848968506, "step": 814 }, { "epoch": 0.18, "learning_rate": 9.905580205260487e-06, "logits/chosen": -1.3082655668258667, "logits/rejected": -1.1618229150772095, "logps/chosen": -79.45481872558594, "logps/rejected": -82.33330535888672, "loss": 0.9768, "rewards/accuracies": 0.0, "rewards/chosen": 7.177180767059326, "rewards/margins": -0.4318366050720215, "rewards/rejected": 7.609017372131348, "step": 815 }, { "epoch": 0.18, "learning_rate": 9.905233217912102e-06, "logits/chosen": -1.070644736289978, "logits/rejected": -1.036794662475586, "logps/chosen": -47.82432556152344, "logps/rejected": -32.83163833618164, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 4.538389682769775, "rewards/margins": 1.8739266395568848, "rewards/rejected": 2.6644630432128906, "step": 816 }, { "epoch": 0.18, "learning_rate": 9.904885600253038e-06, "logits/chosen": -1.2940601110458374, "logits/rejected": -1.2940601110458374, "logps/chosen": -94.02394104003906, "logps/rejected": -94.02394104003906, "loss": 0.3524, "rewards/accuracies": 0.0, "rewards/chosen": 4.529402256011963, "rewards/margins": 0.0, "rewards/rejected": 4.529402256011963, "step": 817 }, { "epoch": 0.18, "learning_rate": 9.904537352327968e-06, "logits/chosen": -0.9511178731918335, "logits/rejected": -0.9511178731918335, "logps/chosen": -43.013126373291016, "logps/rejected": -43.013126373291016, "loss": 0.5482, "rewards/accuracies": 0.0, "rewards/chosen": 0.9090687036514282, "rewards/margins": 0.0, "rewards/rejected": 0.9090687036514282, "step": 818 }, { "epoch": 0.18, "learning_rate": 9.904188474181637e-06, "logits/chosen": -0.6656609773635864, "logits/rejected": -0.6961214542388916, "logps/chosen": -17.98586654663086, "logps/rejected": -28.787473678588867, "loss": 1.6699, "rewards/accuracies": 0.0, "rewards/chosen": 1.171000361442566, "rewards/margins": -0.7132744789123535, "rewards/rejected": 1.8842748403549194, "step": 819 }, { "epoch": 0.18, "learning_rate": 9.903838965858877e-06, "logits/chosen": -0.8402917385101318, "logits/rejected": -0.8136827349662781, "logps/chosen": -45.31509017944336, "logps/rejected": -44.81433868408203, "loss": 0.4109, "rewards/accuracies": 0.0, "rewards/chosen": 2.2241628170013428, "rewards/margins": -0.1354694366455078, "rewards/rejected": 2.3596322536468506, "step": 820 }, { "epoch": 0.18, "learning_rate": 9.9034888274046e-06, "logits/chosen": -1.207231879234314, "logits/rejected": -1.1195257902145386, "logps/chosen": -125.68586730957031, "logps/rejected": -106.33623504638672, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": 5.187110900878906, "rewards/margins": 1.742760419845581, "rewards/rejected": 3.444350481033325, "step": 821 }, { "epoch": 0.18, "learning_rate": 9.903138058863793e-06, "logits/chosen": -1.1469026803970337, "logits/rejected": -1.0994617938995361, "logps/chosen": -49.78434753417969, "logps/rejected": -40.999725341796875, "loss": 0.6625, "rewards/accuracies": 1.0, "rewards/chosen": 2.859375, "rewards/margins": 0.1266188621520996, "rewards/rejected": 2.7327561378479004, "step": 822 }, { "epoch": 0.18, "learning_rate": 9.902786660281533e-06, "logits/chosen": -0.9832112789154053, "logits/rejected": -1.030719518661499, "logps/chosen": -16.648286819458008, "logps/rejected": -33.45597457885742, "loss": 1.2458, "rewards/accuracies": 0.0, "rewards/chosen": 1.2754637002944946, "rewards/margins": -2.0889639854431152, "rewards/rejected": 3.3644275665283203, "step": 823 }, { "epoch": 0.18, "learning_rate": 9.902434631702976e-06, "logits/chosen": -0.9727417230606079, "logits/rejected": -0.9710721969604492, "logps/chosen": -58.76342010498047, "logps/rejected": -53.918067932128906, "loss": 0.5685, "rewards/accuracies": 0.0, "rewards/chosen": 2.274967908859253, "rewards/margins": -0.7490990161895752, "rewards/rejected": 3.024066925048828, "step": 824 }, { "epoch": 0.18, "learning_rate": 9.902081973173352e-06, "logits/chosen": -1.1277762651443481, "logits/rejected": -1.1277762651443481, "logps/chosen": -95.29116821289062, "logps/rejected": -95.29116821289062, "loss": 0.3754, "rewards/accuracies": 0.0, "rewards/chosen": 4.302623271942139, "rewards/margins": 0.0, "rewards/rejected": 4.302623271942139, "step": 825 }, { "epoch": 0.18, "learning_rate": 9.901728684737977e-06, "logits/chosen": -1.1690161228179932, "logits/rejected": -1.4399725198745728, "logps/chosen": -76.09149169921875, "logps/rejected": -37.770729064941406, "loss": 0.3888, "rewards/accuracies": 1.0, "rewards/chosen": 2.3815994262695312, "rewards/margins": 0.4750465154647827, "rewards/rejected": 1.9065529108047485, "step": 826 }, { "epoch": 0.18, "learning_rate": 9.901374766442252e-06, "logits/chosen": -1.2479056119918823, "logits/rejected": -1.3219938278198242, "logps/chosen": -94.05443572998047, "logps/rejected": -91.1262435913086, "loss": 3.6337, "rewards/accuracies": 0.0, "rewards/chosen": 1.9306557178497314, "rewards/margins": -6.5729217529296875, "rewards/rejected": 8.50357723236084, "step": 827 }, { "epoch": 0.18, "learning_rate": 9.901020218331652e-06, "logits/chosen": -1.307057499885559, "logits/rejected": -1.307057499885559, "logps/chosen": -39.66059112548828, "logps/rejected": -39.66059112548828, "loss": 0.8681, "rewards/accuracies": 0.0, "rewards/chosen": 0.8495556116104126, "rewards/margins": 0.0, "rewards/rejected": 0.8495556116104126, "step": 828 }, { "epoch": 0.18, "learning_rate": 9.900665040451735e-06, "logits/chosen": -1.113857626914978, "logits/rejected": -1.1505743265151978, "logps/chosen": -63.554412841796875, "logps/rejected": -107.70701599121094, "loss": 1.9175, "rewards/accuracies": 0.0, "rewards/chosen": 2.9491043090820312, "rewards/margins": -0.3792099952697754, "rewards/rejected": 3.3283143043518066, "step": 829 }, { "epoch": 0.18, "learning_rate": 9.90030923284814e-06, "logits/chosen": -1.2862292528152466, "logits/rejected": -1.3353832960128784, "logps/chosen": -54.30952453613281, "logps/rejected": -35.84076690673828, "loss": 1.1256, "rewards/accuracies": 0.0, "rewards/chosen": 1.6292526721954346, "rewards/margins": -1.0254158973693848, "rewards/rejected": 2.6546685695648193, "step": 830 }, { "epoch": 0.18, "learning_rate": 9.89995279556659e-06, "logits/chosen": -0.7833419442176819, "logits/rejected": -0.8423076868057251, "logps/chosen": -60.49567413330078, "logps/rejected": -77.13145446777344, "loss": 0.6608, "rewards/accuracies": 0.0, "rewards/chosen": 2.9451286792755127, "rewards/margins": -0.720008134841919, "rewards/rejected": 3.6651368141174316, "step": 831 }, { "epoch": 0.18, "learning_rate": 9.899595728652883e-06, "logits/chosen": -1.078598976135254, "logits/rejected": -1.0846507549285889, "logps/chosen": -68.23933410644531, "logps/rejected": -42.20820617675781, "loss": 0.5568, "rewards/accuracies": 0.0, "rewards/chosen": 2.293940782546997, "rewards/margins": -0.6545090675354004, "rewards/rejected": 2.9484498500823975, "step": 832 }, { "epoch": 0.18, "learning_rate": 9.899238032152907e-06, "logits/chosen": -1.039074420928955, "logits/rejected": -0.9618403315544128, "logps/chosen": -35.740325927734375, "logps/rejected": -30.78211212158203, "loss": 1.0159, "rewards/accuracies": 1.0, "rewards/chosen": 2.0525691509246826, "rewards/margins": 0.5822199583053589, "rewards/rejected": 1.4703491926193237, "step": 833 }, { "epoch": 0.18, "learning_rate": 9.898879706112618e-06, "logits/chosen": -1.279819130897522, "logits/rejected": -1.18146550655365, "logps/chosen": -143.36781311035156, "logps/rejected": -51.60980224609375, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 5.493711948394775, "rewards/margins": 3.22635817527771, "rewards/rejected": 2.2673537731170654, "step": 834 }, { "epoch": 0.18, "learning_rate": 9.898520750578065e-06, "logits/chosen": -1.3773164749145508, "logits/rejected": -1.2748878002166748, "logps/chosen": -79.5528793334961, "logps/rejected": -67.05526733398438, "loss": 0.2758, "rewards/accuracies": 1.0, "rewards/chosen": 6.549178600311279, "rewards/margins": 1.6494593620300293, "rewards/rejected": 4.89971923828125, "step": 835 }, { "epoch": 0.19, "learning_rate": 9.898161165595371e-06, "logits/chosen": -1.2673778533935547, "logits/rejected": -1.2247488498687744, "logps/chosen": -126.30359649658203, "logps/rejected": -93.57243347167969, "loss": 0.3355, "rewards/accuracies": 1.0, "rewards/chosen": 6.753296852111816, "rewards/margins": 0.057627201080322266, "rewards/rejected": 6.695669651031494, "step": 836 }, { "epoch": 0.19, "learning_rate": 9.897800951210741e-06, "logits/chosen": -1.1425001621246338, "logits/rejected": -1.0950959920883179, "logps/chosen": -81.32443237304688, "logps/rejected": -82.71774291992188, "loss": 0.3993, "rewards/accuracies": 1.0, "rewards/chosen": 2.622859239578247, "rewards/margins": 0.08941268920898438, "rewards/rejected": 2.5334465503692627, "step": 837 }, { "epoch": 0.19, "learning_rate": 9.897440107470463e-06, "logits/chosen": -1.0283267498016357, "logits/rejected": -1.0133452415466309, "logps/chosen": -77.04598236083984, "logps/rejected": -91.77979278564453, "loss": 1.3818, "rewards/accuracies": 0.0, "rewards/chosen": 1.3773910999298096, "rewards/margins": -2.1569793224334717, "rewards/rejected": 3.5343704223632812, "step": 838 }, { "epoch": 0.19, "learning_rate": 9.897078634420905e-06, "logits/chosen": -1.4179131984710693, "logits/rejected": -1.278337836265564, "logps/chosen": -133.8927459716797, "logps/rejected": -60.943870544433594, "loss": 0.5658, "rewards/accuracies": 1.0, "rewards/chosen": 8.338932991027832, "rewards/margins": 5.00681209564209, "rewards/rejected": 3.332120656967163, "step": 839 }, { "epoch": 0.19, "learning_rate": 9.896716532108515e-06, "logits/chosen": -1.1012786626815796, "logits/rejected": -1.1308629512786865, "logps/chosen": -124.18366241455078, "logps/rejected": -93.01783752441406, "loss": 0.7742, "rewards/accuracies": 1.0, "rewards/chosen": 5.9745659828186035, "rewards/margins": 1.2494330406188965, "rewards/rejected": 4.725132942199707, "step": 840 }, { "epoch": 0.19, "learning_rate": 9.896353800579823e-06, "logits/chosen": -0.8081870079040527, "logits/rejected": -0.7110928893089294, "logps/chosen": -39.343536376953125, "logps/rejected": -51.147003173828125, "loss": 1.0174, "rewards/accuracies": 1.0, "rewards/chosen": 2.3611741065979004, "rewards/margins": 0.771072506904602, "rewards/rejected": 1.5901015996932983, "step": 841 }, { "epoch": 0.19, "learning_rate": 9.895990439881436e-06, "logits/chosen": -1.0277310609817505, "logits/rejected": -1.0799732208251953, "logps/chosen": -60.164947509765625, "logps/rejected": -50.44982147216797, "loss": 2.3064, "rewards/accuracies": 0.0, "rewards/chosen": 1.3332337141036987, "rewards/margins": -1.5064178705215454, "rewards/rejected": 2.839651584625244, "step": 842 }, { "epoch": 0.19, "learning_rate": 9.895626450060047e-06, "logits/chosen": -0.986717700958252, "logits/rejected": -0.8739109635353088, "logps/chosen": -130.8045196533203, "logps/rejected": -73.61346435546875, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 5.876991271972656, "rewards/margins": 2.149822950363159, "rewards/rejected": 3.727168321609497, "step": 843 }, { "epoch": 0.19, "learning_rate": 9.89526183116243e-06, "logits/chosen": -1.1761467456817627, "logits/rejected": -1.1926473379135132, "logps/chosen": -55.12879943847656, "logps/rejected": -168.87527465820312, "loss": 2.129, "rewards/accuracies": 0.0, "rewards/chosen": 3.5485000610351562, "rewards/margins": -4.230513095855713, "rewards/rejected": 7.779013156890869, "step": 844 }, { "epoch": 0.19, "learning_rate": 9.894896583235434e-06, "logits/chosen": -1.3094007968902588, "logits/rejected": -1.3440515995025635, "logps/chosen": -111.20849609375, "logps/rejected": -89.09590148925781, "loss": 2.0464, "rewards/accuracies": 0.0, "rewards/chosen": 4.07852029800415, "rewards/margins": -2.6190171241760254, "rewards/rejected": 6.697537422180176, "step": 845 }, { "epoch": 0.19, "learning_rate": 9.894530706325994e-06, "logits/chosen": -1.3688700199127197, "logits/rejected": -1.2145830392837524, "logps/chosen": -166.9005584716797, "logps/rejected": -87.70384216308594, "loss": 0.1961, "rewards/accuracies": 1.0, "rewards/chosen": 5.758983135223389, "rewards/margins": 0.76556396484375, "rewards/rejected": 4.993419170379639, "step": 846 }, { "epoch": 0.19, "learning_rate": 9.894164200481124e-06, "logits/chosen": -0.9774512648582458, "logits/rejected": -0.9774512648582458, "logps/chosen": -23.346378326416016, "logps/rejected": -23.346378326416016, "loss": 1.219, "rewards/accuracies": 0.0, "rewards/chosen": 1.3636707067489624, "rewards/margins": 0.0, "rewards/rejected": 1.3636707067489624, "step": 847 }, { "epoch": 0.19, "learning_rate": 9.89379706574792e-06, "logits/chosen": -0.9592823386192322, "logits/rejected": -0.9075645804405212, "logps/chosen": -47.612548828125, "logps/rejected": -61.703453063964844, "loss": 0.2848, "rewards/accuracies": 1.0, "rewards/chosen": 2.943272352218628, "rewards/margins": 0.2651541233062744, "rewards/rejected": 2.6781182289123535, "step": 848 }, { "epoch": 0.19, "learning_rate": 9.893429302173558e-06, "logits/chosen": -1.1970171928405762, "logits/rejected": -1.0410624742507935, "logps/chosen": -82.84436798095703, "logps/rejected": -67.14705657958984, "loss": 0.2073, "rewards/accuracies": 1.0, "rewards/chosen": 2.9150140285491943, "rewards/margins": 0.6686546802520752, "rewards/rejected": 2.246359348297119, "step": 849 }, { "epoch": 0.19, "learning_rate": 9.893060909805294e-06, "logits/chosen": -0.8697724342346191, "logits/rejected": -0.8278974890708923, "logps/chosen": -48.54055404663086, "logps/rejected": -35.69525909423828, "loss": 0.5779, "rewards/accuracies": 1.0, "rewards/chosen": 2.4179980754852295, "rewards/margins": 0.5307841300964355, "rewards/rejected": 1.887213945388794, "step": 850 }, { "epoch": 0.19, "learning_rate": 9.892691888690466e-06, "logits/chosen": -1.005776286125183, "logits/rejected": -1.0775675773620605, "logps/chosen": -106.69529724121094, "logps/rejected": -120.36997985839844, "loss": 0.3483, "rewards/accuracies": 1.0, "rewards/chosen": 6.1904616355896, "rewards/margins": 2.4033429622650146, "rewards/rejected": 3.787118673324585, "step": 851 }, { "epoch": 0.19, "learning_rate": 9.892322238876492e-06, "logits/chosen": -1.040341854095459, "logits/rejected": -1.040341854095459, "logps/chosen": -30.628246307373047, "logps/rejected": -30.628246307373047, "loss": 0.8597, "rewards/accuracies": 0.0, "rewards/chosen": 2.346470355987549, "rewards/margins": 0.0, "rewards/rejected": 2.346470355987549, "step": 852 }, { "epoch": 0.19, "learning_rate": 9.89195196041087e-06, "logits/chosen": -1.206430435180664, "logits/rejected": -1.2126210927963257, "logps/chosen": -104.116455078125, "logps/rejected": -105.68299102783203, "loss": 0.7758, "rewards/accuracies": 0.0, "rewards/chosen": 5.700749397277832, "rewards/margins": -1.305264949798584, "rewards/rejected": 7.006014347076416, "step": 853 }, { "epoch": 0.19, "learning_rate": 9.891581053341182e-06, "logits/chosen": -1.081734538078308, "logits/rejected": -0.8481799364089966, "logps/chosen": -170.20388793945312, "logps/rejected": -97.17500305175781, "loss": 0.5086, "rewards/accuracies": 0.0, "rewards/chosen": 5.284081935882568, "rewards/margins": -0.5293002128601074, "rewards/rejected": 5.813382148742676, "step": 854 }, { "epoch": 0.19, "learning_rate": 9.891209517715088e-06, "logits/chosen": -1.396124243736267, "logits/rejected": -1.38882315158844, "logps/chosen": -63.954063415527344, "logps/rejected": -91.2056884765625, "loss": 0.2368, "rewards/accuracies": 1.0, "rewards/chosen": 3.3344902992248535, "rewards/margins": 0.547792911529541, "rewards/rejected": 2.7866973876953125, "step": 855 }, { "epoch": 0.19, "learning_rate": 9.890837353580327e-06, "logits/chosen": -1.4902279376983643, "logits/rejected": -1.4171620607376099, "logps/chosen": -128.93829345703125, "logps/rejected": -96.96610260009766, "loss": 2.4214, "rewards/accuracies": 0.0, "rewards/chosen": 7.123818874359131, "rewards/margins": -0.33759403228759766, "rewards/rejected": 7.4614129066467285, "step": 856 }, { "epoch": 0.19, "learning_rate": 9.890464560984725e-06, "logits/chosen": -1.168135643005371, "logits/rejected": -1.0738798379898071, "logps/chosen": -117.57525634765625, "logps/rejected": -60.76231002807617, "loss": 0.4459, "rewards/accuracies": 0.0, "rewards/chosen": 4.116626262664795, "rewards/margins": -0.2513904571533203, "rewards/rejected": 4.368016719818115, "step": 857 }, { "epoch": 0.19, "learning_rate": 9.890091139976183e-06, "logits/chosen": -1.0569454431533813, "logits/rejected": -0.9972559809684753, "logps/chosen": -77.0731201171875, "logps/rejected": -97.32003021240234, "loss": 1.6258, "rewards/accuracies": 1.0, "rewards/chosen": 4.815974712371826, "rewards/margins": 1.447709083557129, "rewards/rejected": 3.3682656288146973, "step": 858 }, { "epoch": 0.19, "learning_rate": 9.889717090602685e-06, "logits/chosen": -1.1450066566467285, "logits/rejected": -1.200321912765503, "logps/chosen": -95.00345611572266, "logps/rejected": -83.41441345214844, "loss": 1.1438, "rewards/accuracies": 0.0, "rewards/chosen": 4.4311089515686035, "rewards/margins": -2.0073113441467285, "rewards/rejected": 6.438420295715332, "step": 859 }, { "epoch": 0.19, "learning_rate": 9.889342412912296e-06, "logits/chosen": -0.9687401652336121, "logits/rejected": -0.927932620048523, "logps/chosen": -43.151554107666016, "logps/rejected": -29.282424926757812, "loss": 0.8017, "rewards/accuracies": 0.0, "rewards/chosen": 0.8754909634590149, "rewards/margins": -0.39526182413101196, "rewards/rejected": 1.2707527875900269, "step": 860 }, { "epoch": 0.19, "learning_rate": 9.88896710695316e-06, "logits/chosen": -0.9831348657608032, "logits/rejected": -0.9373592138290405, "logps/chosen": -24.955446243286133, "logps/rejected": -38.70436096191406, "loss": 0.6222, "rewards/accuracies": 0.0, "rewards/chosen": 0.990590512752533, "rewards/margins": -0.8742937445640564, "rewards/rejected": 1.8648842573165894, "step": 861 }, { "epoch": 0.19, "learning_rate": 9.888591172773502e-06, "logits/chosen": -1.1611524820327759, "logits/rejected": -1.0520814657211304, "logps/chosen": -67.46104431152344, "logps/rejected": -109.36293029785156, "loss": 0.2456, "rewards/accuracies": 1.0, "rewards/chosen": 1.9902664422988892, "rewards/margins": 0.4607902765274048, "rewards/rejected": 1.5294761657714844, "step": 862 }, { "epoch": 0.19, "learning_rate": 9.888214610421633e-06, "logits/chosen": -1.1112357378005981, "logits/rejected": -0.8363744020462036, "logps/chosen": -104.50685119628906, "logps/rejected": -58.56574249267578, "loss": 1.73, "rewards/accuracies": 1.0, "rewards/chosen": 4.571119785308838, "rewards/margins": 1.6156487464904785, "rewards/rejected": 2.9554710388183594, "step": 863 }, { "epoch": 0.19, "learning_rate": 9.887837419945937e-06, "logits/chosen": -1.5651031732559204, "logits/rejected": -1.4362138509750366, "logps/chosen": -100.411865234375, "logps/rejected": -61.21220779418945, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 5.404069423675537, "rewards/margins": 3.7356913089752197, "rewards/rejected": 1.6683781147003174, "step": 864 }, { "epoch": 0.19, "learning_rate": 9.887459601394881e-06, "logits/chosen": -1.0815801620483398, "logits/rejected": -1.0610285997390747, "logps/chosen": -49.56546401977539, "logps/rejected": -77.18498992919922, "loss": 1.4687, "rewards/accuracies": 0.0, "rewards/chosen": 0.8550540804862976, "rewards/margins": -2.708754301071167, "rewards/rejected": 3.5638084411621094, "step": 865 }, { "epoch": 0.19, "learning_rate": 9.887081154817015e-06, "logits/chosen": -1.1278678178787231, "logits/rejected": -1.1278678178787231, "logps/chosen": -64.81282043457031, "logps/rejected": -64.81282043457031, "loss": 0.7221, "rewards/accuracies": 0.0, "rewards/chosen": 3.8523330688476562, "rewards/margins": 0.0, "rewards/rejected": 3.8523330688476562, "step": 866 }, { "epoch": 0.19, "learning_rate": 9.88670208026097e-06, "logits/chosen": -1.4038234949111938, "logits/rejected": -1.3684146404266357, "logps/chosen": -67.80461120605469, "logps/rejected": -84.2885971069336, "loss": 0.1308, "rewards/accuracies": 1.0, "rewards/chosen": 5.141410827636719, "rewards/margins": 1.2206878662109375, "rewards/rejected": 3.9207229614257812, "step": 867 }, { "epoch": 0.19, "learning_rate": 9.886322377775455e-06, "logits/chosen": -1.0534225702285767, "logits/rejected": -0.9515531659126282, "logps/chosen": -59.02688217163086, "logps/rejected": -60.52180480957031, "loss": 0.2541, "rewards/accuracies": 1.0, "rewards/chosen": 2.2127552032470703, "rewards/margins": 0.43706321716308594, "rewards/rejected": 1.7756919860839844, "step": 868 }, { "epoch": 0.19, "learning_rate": 9.885942047409262e-06, "logits/chosen": -1.2766178846359253, "logits/rejected": -1.2870627641677856, "logps/chosen": -126.11187744140625, "logps/rejected": -170.2183380126953, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 3.5279908180236816, "rewards/margins": 1.0196945667266846, "rewards/rejected": 2.508296251296997, "step": 869 }, { "epoch": 0.19, "learning_rate": 9.885561089211259e-06, "logits/chosen": -0.9587857127189636, "logits/rejected": -0.9016436338424683, "logps/chosen": -29.109275817871094, "logps/rejected": -7.133547306060791, "loss": 2.0761, "rewards/accuracies": 1.0, "rewards/chosen": 2.3000829219818115, "rewards/margins": 1.920212984085083, "rewards/rejected": 0.3798699975013733, "step": 870 }, { "epoch": 0.19, "learning_rate": 9.885179503230403e-06, "logits/chosen": -1.0014716386795044, "logits/rejected": -0.8767028450965881, "logps/chosen": -39.11731719970703, "logps/rejected": -21.090620040893555, "loss": 0.3814, "rewards/accuracies": 1.0, "rewards/chosen": 1.5278023481369019, "rewards/margins": 0.5955340266227722, "rewards/rejected": 0.9322683215141296, "step": 871 }, { "epoch": 0.19, "learning_rate": 9.884797289515723e-06, "logits/chosen": -0.9079892039299011, "logits/rejected": -0.7876458168029785, "logps/chosen": -46.56914520263672, "logps/rejected": -74.13622283935547, "loss": 1.1739, "rewards/accuracies": 0.0, "rewards/chosen": 2.0946297645568848, "rewards/margins": -1.1230766773223877, "rewards/rejected": 3.2177064418792725, "step": 872 }, { "epoch": 0.19, "learning_rate": 9.884414448116335e-06, "logits/chosen": -0.9894654750823975, "logits/rejected": -0.9771995544433594, "logps/chosen": -47.95357894897461, "logps/rejected": -45.663063049316406, "loss": 2.9585, "rewards/accuracies": 0.0, "rewards/chosen": 1.7850894927978516, "rewards/margins": -0.5358676910400391, "rewards/rejected": 2.3209571838378906, "step": 873 }, { "epoch": 0.19, "learning_rate": 9.88403097908143e-06, "logits/chosen": -1.1568418741226196, "logits/rejected": -1.1735936403274536, "logps/chosen": -45.55965042114258, "logps/rejected": -36.35151290893555, "loss": 0.5955, "rewards/accuracies": 0.0, "rewards/chosen": 2.240037202835083, "rewards/margins": -0.8279449939727783, "rewards/rejected": 3.0679821968078613, "step": 874 }, { "epoch": 0.19, "learning_rate": 9.883646882460287e-06, "logits/chosen": -1.5075814723968506, "logits/rejected": -1.5168224573135376, "logps/chosen": -69.774658203125, "logps/rejected": -60.62721633911133, "loss": 1.2692, "rewards/accuracies": 0.0, "rewards/chosen": 1.9818146228790283, "rewards/margins": -1.6071619987487793, "rewards/rejected": 3.5889766216278076, "step": 875 }, { "epoch": 0.19, "learning_rate": 9.883262158302259e-06, "logits/chosen": -1.1480106115341187, "logits/rejected": -1.031376838684082, "logps/chosen": -171.36355590820312, "logps/rejected": -68.06375122070312, "loss": 0.7643, "rewards/accuracies": 1.0, "rewards/chosen": 5.018077373504639, "rewards/margins": 0.08230161666870117, "rewards/rejected": 4.9357757568359375, "step": 876 }, { "epoch": 0.19, "learning_rate": 9.882876806656783e-06, "logits/chosen": -0.987391471862793, "logits/rejected": -0.9087944030761719, "logps/chosen": -82.74596405029297, "logps/rejected": -55.919044494628906, "loss": 0.5092, "rewards/accuracies": 0.0, "rewards/chosen": 2.1097023487091064, "rewards/margins": -0.5635063648223877, "rewards/rejected": 2.673208713531494, "step": 877 }, { "epoch": 0.19, "learning_rate": 9.882490827573375e-06, "logits/chosen": -1.7431443929672241, "logits/rejected": -1.0635628700256348, "logps/chosen": -100.95349884033203, "logps/rejected": -119.19971466064453, "loss": 0.7612, "rewards/accuracies": 0.0, "rewards/chosen": 2.0711257457733154, "rewards/margins": -0.6138086318969727, "rewards/rejected": 2.684934377670288, "step": 878 }, { "epoch": 0.19, "learning_rate": 9.882104221101634e-06, "logits/chosen": -1.4808545112609863, "logits/rejected": -1.4805814027786255, "logps/chosen": -62.6119384765625, "logps/rejected": -61.10277557373047, "loss": 0.4698, "rewards/accuracies": 0.0, "rewards/chosen": 4.33761739730835, "rewards/margins": -0.3747749328613281, "rewards/rejected": 4.712392330169678, "step": 879 }, { "epoch": 0.19, "learning_rate": 9.881716987291235e-06, "logits/chosen": -1.1427114009857178, "logits/rejected": -1.1427114009857178, "logps/chosen": -5.2916436195373535, "logps/rejected": -5.2916436195373535, "loss": 1.1141, "rewards/accuracies": 0.0, "rewards/chosen": 0.5373521447181702, "rewards/margins": 0.0, "rewards/rejected": 0.5373521447181702, "step": 880 }, { "epoch": 0.19, "learning_rate": 9.88132912619194e-06, "logits/chosen": -1.2755810022354126, "logits/rejected": -1.0085220336914062, "logps/chosen": -171.7745361328125, "logps/rejected": -81.8896713256836, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 7.254185676574707, "rewards/margins": 4.600088119506836, "rewards/rejected": 2.65409779548645, "step": 881 }, { "epoch": 0.2, "learning_rate": 9.880940637853585e-06, "logits/chosen": -1.1752089262008667, "logits/rejected": -1.1388345956802368, "logps/chosen": -64.87130737304688, "logps/rejected": -47.03418731689453, "loss": 0.2797, "rewards/accuracies": 1.0, "rewards/chosen": 3.6755218505859375, "rewards/margins": 0.5114874839782715, "rewards/rejected": 3.164034366607666, "step": 882 }, { "epoch": 0.2, "learning_rate": 9.880551522326093e-06, "logits/chosen": -1.4632822275161743, "logits/rejected": -1.4859848022460938, "logps/chosen": -99.47480773925781, "logps/rejected": -147.49095153808594, "loss": 0.2491, "rewards/accuracies": 1.0, "rewards/chosen": 6.702769756317139, "rewards/margins": 0.6080231666564941, "rewards/rejected": 6.0947465896606445, "step": 883 }, { "epoch": 0.2, "learning_rate": 9.880161779659463e-06, "logits/chosen": -0.9278181195259094, "logits/rejected": -1.008902907371521, "logps/chosen": -61.318382263183594, "logps/rejected": -96.73678588867188, "loss": 2.332, "rewards/accuracies": 0.0, "rewards/chosen": 1.6462913751602173, "rewards/margins": -3.871540069580078, "rewards/rejected": 5.517831325531006, "step": 884 }, { "epoch": 0.2, "learning_rate": 9.879771409903775e-06, "logits/chosen": -1.2204999923706055, "logits/rejected": -1.0692775249481201, "logps/chosen": -127.55029296875, "logps/rejected": -40.01226806640625, "loss": 0.1364, "rewards/accuracies": 1.0, "rewards/chosen": 5.634030342102051, "rewards/margins": 2.966947317123413, "rewards/rejected": 2.6670830249786377, "step": 885 }, { "epoch": 0.2, "learning_rate": 9.879380413109193e-06, "logits/chosen": -1.346639633178711, "logits/rejected": -1.3289505243301392, "logps/chosen": -101.98974609375, "logps/rejected": -55.42812728881836, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": 6.486157417297363, "rewards/margins": 3.497019052505493, "rewards/rejected": 2.98913836479187, "step": 886 }, { "epoch": 0.2, "learning_rate": 9.878988789325955e-06, "logits/chosen": -1.3001115322113037, "logits/rejected": -1.2495784759521484, "logps/chosen": -43.735923767089844, "logps/rejected": -60.385009765625, "loss": 2.109, "rewards/accuracies": 0.0, "rewards/chosen": 1.552815318107605, "rewards/margins": -0.8161636590957642, "rewards/rejected": 2.368978977203369, "step": 887 }, { "epoch": 0.2, "learning_rate": 9.878596538604388e-06, "logits/chosen": -1.144651174545288, "logits/rejected": -0.9925026893615723, "logps/chosen": -59.931785583496094, "logps/rejected": -24.512935638427734, "loss": 0.3418, "rewards/accuracies": 1.0, "rewards/chosen": 2.5729119777679443, "rewards/margins": 1.937488317489624, "rewards/rejected": 0.6354236602783203, "step": 888 }, { "epoch": 0.2, "learning_rate": 9.878203660994894e-06, "logits/chosen": -1.0637212991714478, "logits/rejected": -1.0504837036132812, "logps/chosen": -67.44575500488281, "logps/rejected": -62.59389877319336, "loss": 0.9195, "rewards/accuracies": 0.0, "rewards/chosen": 3.500041961669922, "rewards/margins": -0.5670666694641113, "rewards/rejected": 4.067108631134033, "step": 889 }, { "epoch": 0.2, "learning_rate": 9.877810156547956e-06, "logits/chosen": -1.0564247369766235, "logits/rejected": -0.9565896987915039, "logps/chosen": -129.72988891601562, "logps/rejected": -65.7274169921875, "loss": 0.4778, "rewards/accuracies": 1.0, "rewards/chosen": 4.21408224105835, "rewards/margins": 1.2039778232574463, "rewards/rejected": 3.0101044178009033, "step": 890 }, { "epoch": 0.2, "learning_rate": 9.877416025314139e-06, "logits/chosen": -0.9273105263710022, "logits/rejected": -0.8431763648986816, "logps/chosen": -68.31631469726562, "logps/rejected": -40.58643341064453, "loss": 0.7919, "rewards/accuracies": 0.0, "rewards/chosen": 2.020193576812744, "rewards/margins": -1.3329253196716309, "rewards/rejected": 3.353118896484375, "step": 891 }, { "epoch": 0.2, "learning_rate": 9.877021267344087e-06, "logits/chosen": -1.2698758840560913, "logits/rejected": -1.0968464612960815, "logps/chosen": -223.19142150878906, "logps/rejected": -73.7725830078125, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 7.547502040863037, "rewards/margins": 3.6711575984954834, "rewards/rejected": 3.8763444423675537, "step": 892 }, { "epoch": 0.2, "learning_rate": 9.876625882688526e-06, "logits/chosen": -1.3181862831115723, "logits/rejected": -1.1901443004608154, "logps/chosen": -61.38874053955078, "logps/rejected": -45.74851608276367, "loss": 2.645, "rewards/accuracies": 1.0, "rewards/chosen": 2.368542432785034, "rewards/margins": 0.8868621587753296, "rewards/rejected": 1.4816802740097046, "step": 893 }, { "epoch": 0.2, "learning_rate": 9.876229871398263e-06, "logits/chosen": -0.775164008140564, "logits/rejected": -0.7336530089378357, "logps/chosen": -75.17811584472656, "logps/rejected": -107.75677490234375, "loss": 0.8618, "rewards/accuracies": 0.0, "rewards/chosen": 3.265040636062622, "rewards/margins": -0.9838488101959229, "rewards/rejected": 4.248889446258545, "step": 894 }, { "epoch": 0.2, "learning_rate": 9.875833233524183e-06, "logits/chosen": -0.9622606039047241, "logits/rejected": -0.8467841148376465, "logps/chosen": -64.34757995605469, "logps/rejected": -28.724214553833008, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": 2.5009002685546875, "rewards/margins": 0.8268671035766602, "rewards/rejected": 1.6740331649780273, "step": 895 }, { "epoch": 0.2, "learning_rate": 9.875435969117254e-06, "logits/chosen": -1.0233917236328125, "logits/rejected": -0.9508096575737, "logps/chosen": -47.12201690673828, "logps/rejected": -68.0277099609375, "loss": 0.4875, "rewards/accuracies": 1.0, "rewards/chosen": 2.2123963832855225, "rewards/margins": 0.4140305519104004, "rewards/rejected": 1.798365831375122, "step": 896 }, { "epoch": 0.2, "learning_rate": 9.875038078228522e-06, "logits/chosen": -0.990327775478363, "logits/rejected": -0.6300172209739685, "logps/chosen": -102.76071166992188, "logps/rejected": -34.30131149291992, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 5.3086090087890625, "rewards/margins": 2.366628646850586, "rewards/rejected": 2.9419803619384766, "step": 897 }, { "epoch": 0.2, "learning_rate": 9.874639560909118e-06, "logits/chosen": -0.9219553470611572, "logits/rejected": -0.7967340350151062, "logps/chosen": -71.41189575195312, "logps/rejected": -48.09669494628906, "loss": 0.526, "rewards/accuracies": 0.0, "rewards/chosen": 1.9703766107559204, "rewards/margins": -0.6105576753616333, "rewards/rejected": 2.5809342861175537, "step": 898 }, { "epoch": 0.2, "learning_rate": 9.87424041721025e-06, "logits/chosen": -1.244856595993042, "logits/rejected": -1.285671591758728, "logps/chosen": -106.61233520507812, "logps/rejected": -79.23612213134766, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 7.891751289367676, "rewards/margins": 0.8757972717285156, "rewards/rejected": 7.01595401763916, "step": 899 }, { "epoch": 0.2, "learning_rate": 9.873840647183204e-06, "logits/chosen": -1.0611798763275146, "logits/rejected": -1.1216222047805786, "logps/chosen": -31.379640579223633, "logps/rejected": -68.62344360351562, "loss": 1.1317, "rewards/accuracies": 0.0, "rewards/chosen": 1.286324143409729, "rewards/margins": -1.2471712827682495, "rewards/rejected": 2.5334954261779785, "step": 900 }, { "epoch": 0.2, "learning_rate": 9.87344025087935e-06, "logits/chosen": -1.009820818901062, "logits/rejected": -0.854017436504364, "logps/chosen": -53.95884323120117, "logps/rejected": -10.294412612915039, "loss": 0.394, "rewards/accuracies": 1.0, "rewards/chosen": 0.9384422302246094, "rewards/margins": 0.38845211267471313, "rewards/rejected": 0.5499901175498962, "step": 901 }, { "epoch": 0.2, "learning_rate": 9.87303922835014e-06, "logits/chosen": -1.193759799003601, "logits/rejected": -1.2201534509658813, "logps/chosen": -63.80722427368164, "logps/rejected": -62.337799072265625, "loss": 1.5172, "rewards/accuracies": 0.0, "rewards/chosen": 1.8582470417022705, "rewards/margins": -0.27600812911987305, "rewards/rejected": 2.1342551708221436, "step": 902 }, { "epoch": 0.2, "learning_rate": 9.872637579647105e-06, "logits/chosen": -1.5985572338104248, "logits/rejected": -1.5133436918258667, "logps/chosen": -80.18852996826172, "logps/rejected": -43.558876037597656, "loss": 0.9894, "rewards/accuracies": 1.0, "rewards/chosen": 4.649204254150391, "rewards/margins": 1.9194166660308838, "rewards/rejected": 2.729787588119507, "step": 903 }, { "epoch": 0.2, "learning_rate": 9.872235304821853e-06, "logits/chosen": -1.1264495849609375, "logits/rejected": -1.1083428859710693, "logps/chosen": -121.63741302490234, "logps/rejected": -65.8175048828125, "loss": 2.1901, "rewards/accuracies": 1.0, "rewards/chosen": 4.2725138664245605, "rewards/margins": 0.7364497184753418, "rewards/rejected": 3.5360641479492188, "step": 904 }, { "epoch": 0.2, "learning_rate": 9.871832403926077e-06, "logits/chosen": -1.111757516860962, "logits/rejected": -1.111757516860962, "logps/chosen": -59.09965133666992, "logps/rejected": -59.09965133666992, "loss": 1.2394, "rewards/accuracies": 0.0, "rewards/chosen": 3.3453030586242676, "rewards/margins": 0.0, "rewards/rejected": 3.3453030586242676, "step": 905 }, { "epoch": 0.2, "learning_rate": 9.871428877011549e-06, "logits/chosen": -1.5797637701034546, "logits/rejected": -1.431229829788208, "logps/chosen": -115.50456237792969, "logps/rejected": -125.53126525878906, "loss": 1.9579, "rewards/accuracies": 1.0, "rewards/chosen": 6.688304424285889, "rewards/margins": 0.6428790092468262, "rewards/rejected": 6.0454254150390625, "step": 906 }, { "epoch": 0.2, "learning_rate": 9.87102472413012e-06, "logits/chosen": -1.2950345277786255, "logits/rejected": -1.2640208005905151, "logps/chosen": -81.4270248413086, "logps/rejected": -60.617835998535156, "loss": 0.5702, "rewards/accuracies": 1.0, "rewards/chosen": 3.000046491622925, "rewards/margins": 0.17106842994689941, "rewards/rejected": 2.8289780616760254, "step": 907 }, { "epoch": 0.2, "learning_rate": 9.870619945333727e-06, "logits/chosen": -1.0288561582565308, "logits/rejected": -1.0054430961608887, "logps/chosen": -67.8797607421875, "logps/rejected": -67.23060607910156, "loss": 3.1545, "rewards/accuracies": 1.0, "rewards/chosen": 1.227986216545105, "rewards/margins": 0.07324302196502686, "rewards/rejected": 1.1547431945800781, "step": 908 }, { "epoch": 0.2, "learning_rate": 9.870214540674377e-06, "logits/chosen": -1.2875546216964722, "logits/rejected": -1.192115068435669, "logps/chosen": -41.45508575439453, "logps/rejected": -64.8526840209961, "loss": 1.935, "rewards/accuracies": 1.0, "rewards/chosen": 4.474725246429443, "rewards/margins": 1.0274419784545898, "rewards/rejected": 3.4472832679748535, "step": 909 }, { "epoch": 0.2, "learning_rate": 9.869808510204165e-06, "logits/chosen": -1.2781968116760254, "logits/rejected": -1.2694612741470337, "logps/chosen": -49.035301208496094, "logps/rejected": -44.32927703857422, "loss": 0.6398, "rewards/accuracies": 0.0, "rewards/chosen": 3.3447914123535156, "rewards/margins": -0.6518547534942627, "rewards/rejected": 3.9966461658477783, "step": 910 }, { "epoch": 0.2, "learning_rate": 9.869401853975268e-06, "logits/chosen": -1.1639970541000366, "logits/rejected": -1.114670991897583, "logps/chosen": -88.43963623046875, "logps/rejected": -51.71278381347656, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 7.958157539367676, "rewards/margins": 5.283442497253418, "rewards/rejected": 2.6747148036956787, "step": 911 }, { "epoch": 0.2, "learning_rate": 9.868994572039939e-06, "logits/chosen": -0.8477426171302795, "logits/rejected": -0.8608130812644958, "logps/chosen": -42.0197868347168, "logps/rejected": -70.42536163330078, "loss": 1.0429, "rewards/accuracies": 0.0, "rewards/chosen": 2.9880268573760986, "rewards/margins": -1.1072070598602295, "rewards/rejected": 4.095233917236328, "step": 912 }, { "epoch": 0.2, "learning_rate": 9.86858666445051e-06, "logits/chosen": -1.1868910789489746, "logits/rejected": -1.0806174278259277, "logps/chosen": -37.093257904052734, "logps/rejected": -32.60614776611328, "loss": 0.4851, "rewards/accuracies": 0.0, "rewards/chosen": 2.1768901348114014, "rewards/margins": -0.47138333320617676, "rewards/rejected": 2.648273468017578, "step": 913 }, { "epoch": 0.2, "learning_rate": 9.8681781312594e-06, "logits/chosen": -1.3663544654846191, "logits/rejected": -1.3516205549240112, "logps/chosen": -55.49370574951172, "logps/rejected": -86.93608093261719, "loss": 0.8322, "rewards/accuracies": 0.0, "rewards/chosen": 2.9891037940979004, "rewards/margins": -1.4280729293823242, "rewards/rejected": 4.417176723480225, "step": 914 }, { "epoch": 0.2, "learning_rate": 9.867768972519101e-06, "logits/chosen": -1.0108215808868408, "logits/rejected": -0.8966972827911377, "logps/chosen": -133.3651123046875, "logps/rejected": -43.52150344848633, "loss": 1.6178, "rewards/accuracies": 1.0, "rewards/chosen": 3.340977430343628, "rewards/margins": 1.4472755193710327, "rewards/rejected": 1.8937019109725952, "step": 915 }, { "epoch": 0.2, "learning_rate": 9.867359188282193e-06, "logits/chosen": -1.400147795677185, "logits/rejected": -1.391308307647705, "logps/chosen": -47.62093734741211, "logps/rejected": -80.67771911621094, "loss": 1.8479, "rewards/accuracies": 0.0, "rewards/chosen": 2.6052167415618896, "rewards/margins": -0.9232470989227295, "rewards/rejected": 3.528463840484619, "step": 916 }, { "epoch": 0.2, "learning_rate": 9.86694877860133e-06, "logits/chosen": -1.5046851634979248, "logits/rejected": -1.5216543674468994, "logps/chosen": -152.34820556640625, "logps/rejected": -145.39749145507812, "loss": 1.5538, "rewards/accuracies": 0.0, "rewards/chosen": 7.180363655090332, "rewards/margins": -1.507512092590332, "rewards/rejected": 8.687875747680664, "step": 917 }, { "epoch": 0.2, "learning_rate": 9.866537743529247e-06, "logits/chosen": -1.3844181299209595, "logits/rejected": -1.1197820901870728, "logps/chosen": -109.8786849975586, "logps/rejected": -43.13669967651367, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 8.280976295471191, "rewards/margins": 6.993598937988281, "rewards/rejected": 1.2873772382736206, "step": 918 }, { "epoch": 0.2, "learning_rate": 9.866126083118765e-06, "logits/chosen": -1.5078811645507812, "logits/rejected": -1.5983524322509766, "logps/chosen": -44.850433349609375, "logps/rejected": -67.0674057006836, "loss": 3.1008, "rewards/accuracies": 0.0, "rewards/chosen": 3.5009162425994873, "rewards/margins": -4.906048774719238, "rewards/rejected": 8.406965255737305, "step": 919 }, { "epoch": 0.2, "learning_rate": 9.865713797422778e-06, "logits/chosen": -1.530657410621643, "logits/rejected": -1.3339322805404663, "logps/chosen": -89.42774200439453, "logps/rejected": -72.154296875, "loss": 0.8384, "rewards/accuracies": 0.0, "rewards/chosen": 1.5718361139297485, "rewards/margins": -0.9685622453689575, "rewards/rejected": 2.540398359298706, "step": 920 }, { "epoch": 0.2, "learning_rate": 9.865300886494264e-06, "logits/chosen": -1.1085679531097412, "logits/rejected": -1.1096196174621582, "logps/chosen": -55.39951705932617, "logps/rejected": -86.14967346191406, "loss": 1.6824, "rewards/accuracies": 0.0, "rewards/chosen": 3.4378116130828857, "rewards/margins": -3.18117356300354, "rewards/rejected": 6.618985176086426, "step": 921 }, { "epoch": 0.2, "learning_rate": 9.864887350386284e-06, "logits/chosen": -1.7229957580566406, "logits/rejected": -1.77863347530365, "logps/chosen": -45.49195861816406, "logps/rejected": -108.00999450683594, "loss": 3.9828, "rewards/accuracies": 0.0, "rewards/chosen": 3.893216848373413, "rewards/margins": -2.3669960498809814, "rewards/rejected": 6.2602128982543945, "step": 922 }, { "epoch": 0.2, "learning_rate": 9.864473189151972e-06, "logits/chosen": -0.9004290103912354, "logits/rejected": -0.9031553864479065, "logps/chosen": -12.033452033996582, "logps/rejected": -5.173587799072266, "loss": 0.5153, "rewards/accuracies": 0.0, "rewards/chosen": 0.42053595185279846, "rewards/margins": -0.544653058052063, "rewards/rejected": 0.9651889801025391, "step": 923 }, { "epoch": 0.2, "learning_rate": 9.864058402844553e-06, "logits/chosen": -1.0888097286224365, "logits/rejected": -1.1075818538665771, "logps/chosen": -35.43357467651367, "logps/rejected": -60.27855682373047, "loss": 3.3049, "rewards/accuracies": 1.0, "rewards/chosen": 2.672457456588745, "rewards/margins": 0.8308795690536499, "rewards/rejected": 1.8415778875350952, "step": 924 }, { "epoch": 0.2, "learning_rate": 9.863642991517317e-06, "logits/chosen": -1.3354032039642334, "logits/rejected": -1.3064173460006714, "logps/chosen": -47.55595016479492, "logps/rejected": -57.54764175415039, "loss": 0.64, "rewards/accuracies": 0.0, "rewards/chosen": 1.8685063123703003, "rewards/margins": -0.8973151445388794, "rewards/rejected": 2.7658214569091797, "step": 925 }, { "epoch": 0.2, "learning_rate": 9.863226955223653e-06, "logits/chosen": -1.2232180833816528, "logits/rejected": -1.0000207424163818, "logps/chosen": -109.7607421875, "logps/rejected": -51.026817321777344, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 5.770806789398193, "rewards/margins": 3.1439948081970215, "rewards/rejected": 2.626811981201172, "step": 926 }, { "epoch": 0.21, "learning_rate": 9.862810294017014e-06, "logits/chosen": -1.5561755895614624, "logits/rejected": -1.6108171939849854, "logps/chosen": -76.12773132324219, "logps/rejected": -140.07131958007812, "loss": 1.1607, "rewards/accuracies": 0.0, "rewards/chosen": 4.9428863525390625, "rewards/margins": -1.2709627151489258, "rewards/rejected": 6.213849067687988, "step": 927 }, { "epoch": 0.21, "learning_rate": 9.86239300795094e-06, "logits/chosen": -0.9572544097900391, "logits/rejected": -0.9149156808853149, "logps/chosen": -47.30509948730469, "logps/rejected": -44.368900299072266, "loss": 1.3055, "rewards/accuracies": 0.0, "rewards/chosen": 0.6302497982978821, "rewards/margins": -2.080212116241455, "rewards/rejected": 2.7104618549346924, "step": 928 }, { "epoch": 0.21, "learning_rate": 9.861975097079057e-06, "logits/chosen": -0.9941607713699341, "logits/rejected": -1.2191652059555054, "logps/chosen": -40.40083312988281, "logps/rejected": -73.59488677978516, "loss": 0.42, "rewards/accuracies": 1.0, "rewards/chosen": 1.4619735479354858, "rewards/margins": 1.1372809410095215, "rewards/rejected": 0.3246925473213196, "step": 929 }, { "epoch": 0.21, "learning_rate": 9.861556561455061e-06, "logits/chosen": -1.3703361749649048, "logits/rejected": -1.4705753326416016, "logps/chosen": -66.70791625976562, "logps/rejected": -105.2668228149414, "loss": 0.9733, "rewards/accuracies": 0.0, "rewards/chosen": 5.568200588226318, "rewards/margins": -1.7025232315063477, "rewards/rejected": 7.270723819732666, "step": 930 }, { "epoch": 0.21, "learning_rate": 9.861137401132733e-06, "logits/chosen": -1.4505666494369507, "logits/rejected": -1.3009346723556519, "logps/chosen": -99.87564086914062, "logps/rejected": -39.68115997314453, "loss": 0.4235, "rewards/accuracies": 1.0, "rewards/chosen": 7.5037689208984375, "rewards/margins": 5.5470733642578125, "rewards/rejected": 1.956695556640625, "step": 931 }, { "epoch": 0.21, "learning_rate": 9.860717616165934e-06, "logits/chosen": -1.1895232200622559, "logits/rejected": -1.0554460287094116, "logps/chosen": -73.32223510742188, "logps/rejected": -98.22264099121094, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 8.529547691345215, "rewards/margins": 6.952199459075928, "rewards/rejected": 1.5773483514785767, "step": 932 }, { "epoch": 0.21, "learning_rate": 9.860297206608606e-06, "logits/chosen": -0.9255294799804688, "logits/rejected": -0.8907206654548645, "logps/chosen": -69.5523681640625, "logps/rejected": -90.24752807617188, "loss": 1.1927, "rewards/accuracies": 0.0, "rewards/chosen": 2.3854644298553467, "rewards/margins": -1.0244979858398438, "rewards/rejected": 3.4099624156951904, "step": 933 }, { "epoch": 0.21, "learning_rate": 9.859876172514773e-06, "logits/chosen": -1.4601914882659912, "logits/rejected": -1.457285761833191, "logps/chosen": -73.25069427490234, "logps/rejected": -187.73934936523438, "loss": 3.2019, "rewards/accuracies": 0.0, "rewards/chosen": 2.7267906665802, "rewards/margins": -6.369422912597656, "rewards/rejected": 9.096213340759277, "step": 934 }, { "epoch": 0.21, "learning_rate": 9.859454513938534e-06, "logits/chosen": -1.4661635160446167, "logits/rejected": -1.4429465532302856, "logps/chosen": -138.65066528320312, "logps/rejected": -95.14086151123047, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 5.1793060302734375, "rewards/margins": 0.7497978210449219, "rewards/rejected": 4.429508209228516, "step": 935 }, { "epoch": 0.21, "learning_rate": 9.859032230934071e-06, "logits/chosen": -1.045000672340393, "logits/rejected": -1.0176106691360474, "logps/chosen": -32.23161697387695, "logps/rejected": -19.834613800048828, "loss": 0.4505, "rewards/accuracies": 0.0, "rewards/chosen": 1.820211410522461, "rewards/margins": -0.30925679206848145, "rewards/rejected": 2.1294682025909424, "step": 936 }, { "epoch": 0.21, "learning_rate": 9.858609323555646e-06, "logits/chosen": -1.4694499969482422, "logits/rejected": -1.4185330867767334, "logps/chosen": -30.57219696044922, "logps/rejected": -41.24454879760742, "loss": 0.3897, "rewards/accuracies": 1.0, "rewards/chosen": 2.4432480335235596, "rewards/margins": 0.14667391777038574, "rewards/rejected": 2.296574115753174, "step": 937 }, { "epoch": 0.21, "learning_rate": 9.858185791857604e-06, "logits/chosen": -1.3669925928115845, "logits/rejected": -1.078959584236145, "logps/chosen": -152.93621826171875, "logps/rejected": -25.020946502685547, "loss": 1.3251, "rewards/accuracies": 1.0, "rewards/chosen": 7.683366298675537, "rewards/margins": 6.542492389678955, "rewards/rejected": 1.1408737897872925, "step": 938 }, { "epoch": 0.21, "learning_rate": 9.857761635894367e-06, "logits/chosen": -1.3411682844161987, "logits/rejected": -1.079909086227417, "logps/chosen": -129.79283142089844, "logps/rejected": -76.58189392089844, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 5.87001371383667, "rewards/margins": 4.382215976715088, "rewards/rejected": 1.4877976179122925, "step": 939 }, { "epoch": 0.21, "learning_rate": 9.857336855720439e-06, "logits/chosen": -0.7498910427093506, "logits/rejected": -0.7525376677513123, "logps/chosen": -6.1331892013549805, "logps/rejected": -8.724863052368164, "loss": 0.8295, "rewards/accuracies": 0.0, "rewards/chosen": 0.7627894282341003, "rewards/margins": -0.4751366972923279, "rewards/rejected": 1.2379261255264282, "step": 940 }, { "epoch": 0.21, "learning_rate": 9.856911451390399e-06, "logits/chosen": -1.4175657033920288, "logits/rejected": -1.2269792556762695, "logps/chosen": -129.38143920898438, "logps/rejected": -62.21339416503906, "loss": 0.7688, "rewards/accuracies": 1.0, "rewards/chosen": 5.222329616546631, "rewards/margins": 2.1683194637298584, "rewards/rejected": 3.0540101528167725, "step": 941 }, { "epoch": 0.21, "learning_rate": 9.856485422958913e-06, "logits/chosen": -1.496389627456665, "logits/rejected": -1.5382084846496582, "logps/chosen": -66.31890869140625, "logps/rejected": -136.3916778564453, "loss": 2.6798, "rewards/accuracies": 0.0, "rewards/chosen": 2.827338457107544, "rewards/margins": -5.347888946533203, "rewards/rejected": 8.175227165222168, "step": 942 }, { "epoch": 0.21, "learning_rate": 9.856058770480726e-06, "logits/chosen": -1.1193006038665771, "logits/rejected": -1.055798888206482, "logps/chosen": -42.56578826904297, "logps/rejected": -43.38157653808594, "loss": 1.3137, "rewards/accuracies": 0.0, "rewards/chosen": 1.4782012701034546, "rewards/margins": -0.18162083625793457, "rewards/rejected": 1.6598221063613892, "step": 943 }, { "epoch": 0.21, "learning_rate": 9.855631494010661e-06, "logits/chosen": -1.2985435724258423, "logits/rejected": -1.2170023918151855, "logps/chosen": -70.80323791503906, "logps/rejected": -87.02818298339844, "loss": 1.1343, "rewards/accuracies": 0.0, "rewards/chosen": 2.840946912765503, "rewards/margins": -2.01932692527771, "rewards/rejected": 4.860273838043213, "step": 944 }, { "epoch": 0.21, "learning_rate": 9.855203593603622e-06, "logits/chosen": -1.137264370918274, "logits/rejected": -1.1602972745895386, "logps/chosen": -71.15135192871094, "logps/rejected": -77.60265350341797, "loss": 1.1735, "rewards/accuracies": 0.0, "rewards/chosen": 2.034132480621338, "rewards/margins": -1.0979087352752686, "rewards/rejected": 3.1320412158966064, "step": 945 }, { "epoch": 0.21, "learning_rate": 9.85477506931459e-06, "logits/chosen": -0.9324562549591064, "logits/rejected": -0.9024081826210022, "logps/chosen": -31.960975646972656, "logps/rejected": -55.73522186279297, "loss": 1.3663, "rewards/accuracies": 0.0, "rewards/chosen": 1.4430763721466064, "rewards/margins": -1.6276192665100098, "rewards/rejected": 3.070695638656616, "step": 946 }, { "epoch": 0.21, "learning_rate": 9.854345921198637e-06, "logits/chosen": -0.97198486328125, "logits/rejected": -0.594547688961029, "logps/chosen": -27.32211685180664, "logps/rejected": -98.81289672851562, "loss": 1.9822, "rewards/accuracies": 0.0, "rewards/chosen": 1.4733387231826782, "rewards/margins": -3.855428695678711, "rewards/rejected": 5.3287672996521, "step": 947 }, { "epoch": 0.21, "learning_rate": 9.853916149310898e-06, "logits/chosen": -1.8376449346542358, "logits/rejected": -1.7336671352386475, "logps/chosen": -62.804630279541016, "logps/rejected": -29.713153839111328, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 4.386430740356445, "rewards/margins": 3.0970816612243652, "rewards/rejected": 1.2893489599227905, "step": 948 }, { "epoch": 0.21, "learning_rate": 9.853485753706603e-06, "logits/chosen": -0.7973731160163879, "logits/rejected": -0.8317925333976746, "logps/chosen": -45.827911376953125, "logps/rejected": -57.24903106689453, "loss": 0.6254, "rewards/accuracies": 0.0, "rewards/chosen": 3.239147901535034, "rewards/margins": -0.4815187454223633, "rewards/rejected": 3.7206666469573975, "step": 949 }, { "epoch": 0.21, "learning_rate": 9.853054734441059e-06, "logits/chosen": -0.9754467010498047, "logits/rejected": -0.9400728940963745, "logps/chosen": -35.422142028808594, "logps/rejected": -35.89314651489258, "loss": 0.295, "rewards/accuracies": 1.0, "rewards/chosen": 2.971721649169922, "rewards/margins": 1.173920750617981, "rewards/rejected": 1.797800898551941, "step": 950 }, { "epoch": 0.21, "learning_rate": 9.852623091569646e-06, "logits/chosen": -1.272020936012268, "logits/rejected": -1.2647008895874023, "logps/chosen": -33.10221862792969, "logps/rejected": -23.516483306884766, "loss": 0.1509, "rewards/accuracies": 1.0, "rewards/chosen": 3.3056931495666504, "rewards/margins": 1.1336257457733154, "rewards/rejected": 2.172067403793335, "step": 951 }, { "epoch": 0.21, "learning_rate": 9.852190825147831e-06, "logits/chosen": -1.2635544538497925, "logits/rejected": -1.0479379892349243, "logps/chosen": -90.24613952636719, "logps/rejected": -31.010108947753906, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 4.230520725250244, "rewards/margins": 3.0929408073425293, "rewards/rejected": 1.1375797986984253, "step": 952 }, { "epoch": 0.21, "learning_rate": 9.85175793523116e-06, "logits/chosen": -0.8800166249275208, "logits/rejected": -0.8915161490440369, "logps/chosen": -71.62702178955078, "logps/rejected": -70.93534088134766, "loss": 0.3873, "rewards/accuracies": 1.0, "rewards/chosen": 2.922320604324341, "rewards/margins": 0.3596787452697754, "rewards/rejected": 2.5626418590545654, "step": 953 }, { "epoch": 0.21, "learning_rate": 9.851324421875256e-06, "logits/chosen": -1.030713677406311, "logits/rejected": -0.8789390325546265, "logps/chosen": -111.8838882446289, "logps/rejected": -38.460121154785156, "loss": 1.3175, "rewards/accuracies": 1.0, "rewards/chosen": 3.8102777004241943, "rewards/margins": 1.1281113624572754, "rewards/rejected": 2.682166337966919, "step": 954 }, { "epoch": 0.21, "learning_rate": 9.850890285135829e-06, "logits/chosen": -1.8428817987442017, "logits/rejected": -1.7302796840667725, "logps/chosen": -185.79525756835938, "logps/rejected": -133.323486328125, "loss": 1.1271, "rewards/accuracies": 1.0, "rewards/chosen": 5.709892272949219, "rewards/margins": 2.789454698562622, "rewards/rejected": 2.9204375743865967, "step": 955 }, { "epoch": 0.21, "learning_rate": 9.850455525068658e-06, "logits/chosen": -1.065055251121521, "logits/rejected": -1.0224202871322632, "logps/chosen": -50.597164154052734, "logps/rejected": -64.93708801269531, "loss": 0.3717, "rewards/accuracies": 0.0, "rewards/chosen": 2.371427536010742, "rewards/margins": -0.08265280723571777, "rewards/rejected": 2.45408034324646, "step": 956 }, { "epoch": 0.21, "learning_rate": 9.850020141729615e-06, "logits/chosen": -1.1425572633743286, "logits/rejected": -1.0637096166610718, "logps/chosen": -64.55359649658203, "logps/rejected": -33.530860900878906, "loss": 0.2531, "rewards/accuracies": 1.0, "rewards/chosen": 2.6797006130218506, "rewards/margins": 0.4454784393310547, "rewards/rejected": 2.234222173690796, "step": 957 }, { "epoch": 0.21, "learning_rate": 9.849584135174642e-06, "logits/chosen": -1.3937386274337769, "logits/rejected": -1.3428009748458862, "logps/chosen": -102.66581726074219, "logps/rejected": -46.59478759765625, "loss": 0.8189, "rewards/accuracies": 1.0, "rewards/chosen": 4.254878520965576, "rewards/margins": 1.1515557765960693, "rewards/rejected": 3.103322744369507, "step": 958 }, { "epoch": 0.21, "learning_rate": 9.849147505459766e-06, "logits/chosen": -1.0331372022628784, "logits/rejected": -0.9974143505096436, "logps/chosen": -37.543235778808594, "logps/rejected": -33.93455123901367, "loss": 2.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.4398484230041504, "rewards/margins": 2.010728597640991, "rewards/rejected": 0.42911988496780396, "step": 959 }, { "epoch": 0.21, "learning_rate": 9.848710252641092e-06, "logits/chosen": -1.232578158378601, "logits/rejected": -1.1970391273498535, "logps/chosen": -138.64964294433594, "logps/rejected": -149.4379425048828, "loss": 1.235, "rewards/accuracies": 0.0, "rewards/chosen": 5.6946868896484375, "rewards/margins": -1.4138565063476562, "rewards/rejected": 7.108543395996094, "step": 960 }, { "epoch": 0.21, "learning_rate": 9.848272376774807e-06, "logits/chosen": -0.4860249161720276, "logits/rejected": -0.4860249161720276, "logps/chosen": -7.502741813659668, "logps/rejected": -7.502741813659668, "loss": 0.4208, "rewards/accuracies": 0.0, "rewards/chosen": 0.4885101318359375, "rewards/margins": 0.0, "rewards/rejected": 0.4885101318359375, "step": 961 }, { "epoch": 0.21, "learning_rate": 9.847833877917177e-06, "logits/chosen": -1.5656108856201172, "logits/rejected": -1.5325297117233276, "logps/chosen": -57.099735260009766, "logps/rejected": -74.53224182128906, "loss": 0.4293, "rewards/accuracies": 1.0, "rewards/chosen": 3.341674566268921, "rewards/margins": 0.678574800491333, "rewards/rejected": 2.663099765777588, "step": 962 }, { "epoch": 0.21, "learning_rate": 9.847394756124547e-06, "logits/chosen": -1.3165428638458252, "logits/rejected": -1.3054492473602295, "logps/chosen": -72.95924377441406, "logps/rejected": -98.60094451904297, "loss": 1.3509, "rewards/accuracies": 0.0, "rewards/chosen": 2.689195394515991, "rewards/margins": -2.590712785720825, "rewards/rejected": 5.279908180236816, "step": 963 }, { "epoch": 0.21, "learning_rate": 9.846955011453343e-06, "logits/chosen": -1.2126235961914062, "logits/rejected": -1.2126235961914062, "logps/chosen": -42.910919189453125, "logps/rejected": -42.910919189453125, "loss": 0.3522, "rewards/accuracies": 0.0, "rewards/chosen": 0.9201904535293579, "rewards/margins": 0.0, "rewards/rejected": 0.9201904535293579, "step": 964 }, { "epoch": 0.21, "learning_rate": 9.846514643960072e-06, "logits/chosen": -1.3315221071243286, "logits/rejected": -1.3035019636154175, "logps/chosen": -43.27899169921875, "logps/rejected": -57.18780517578125, "loss": 1.783, "rewards/accuracies": 1.0, "rewards/chosen": 3.3033790588378906, "rewards/margins": 0.1623084545135498, "rewards/rejected": 3.141070604324341, "step": 965 }, { "epoch": 0.21, "learning_rate": 9.846073653701321e-06, "logits/chosen": -1.1302893161773682, "logits/rejected": -1.0540300607681274, "logps/chosen": -62.054100036621094, "logps/rejected": -73.80943298339844, "loss": 0.4687, "rewards/accuracies": 0.0, "rewards/chosen": 2.960312604904175, "rewards/margins": -0.29033446311950684, "rewards/rejected": 3.2506470680236816, "step": 966 }, { "epoch": 0.21, "learning_rate": 9.845632040733754e-06, "logits/chosen": -1.401376485824585, "logits/rejected": -1.1906884908676147, "logps/chosen": -116.87952423095703, "logps/rejected": -54.53767013549805, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 10.800878524780273, "rewards/margins": 7.725425720214844, "rewards/rejected": 3.0754528045654297, "step": 967 }, { "epoch": 0.21, "learning_rate": 9.845189805114119e-06, "logits/chosen": -1.7329986095428467, "logits/rejected": -1.71755850315094, "logps/chosen": -140.33615112304688, "logps/rejected": -68.97087097167969, "loss": 1.542, "rewards/accuracies": 1.0, "rewards/chosen": 4.169226169586182, "rewards/margins": 0.21452641487121582, "rewards/rejected": 3.954699754714966, "step": 968 }, { "epoch": 0.21, "learning_rate": 9.844746946899241e-06, "logits/chosen": -1.2358086109161377, "logits/rejected": -1.2325787544250488, "logps/chosen": -53.14905548095703, "logps/rejected": -59.215293884277344, "loss": 0.2025, "rewards/accuracies": 1.0, "rewards/chosen": 3.5361106395721436, "rewards/margins": 1.0133864879608154, "rewards/rejected": 2.522724151611328, "step": 969 }, { "epoch": 0.21, "learning_rate": 9.844303466146027e-06, "logits/chosen": -1.4883280992507935, "logits/rejected": -1.2955294847488403, "logps/chosen": -105.55513000488281, "logps/rejected": -47.24407958984375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 5.586133003234863, "rewards/margins": 5.359425067901611, "rewards/rejected": 0.2267078459262848, "step": 970 }, { "epoch": 0.21, "learning_rate": 9.843859362911463e-06, "logits/chosen": -1.2333377599716187, "logits/rejected": -1.16048002243042, "logps/chosen": -67.0320816040039, "logps/rejected": -48.696781158447266, "loss": 1.1434, "rewards/accuracies": 0.0, "rewards/chosen": 2.601691484451294, "rewards/margins": -1.8387172222137451, "rewards/rejected": 4.440408706665039, "step": 971 }, { "epoch": 0.22, "learning_rate": 9.843414637252615e-06, "logits/chosen": -1.597141146659851, "logits/rejected": -1.535277009010315, "logps/chosen": -133.50914001464844, "logps/rejected": -148.67593383789062, "loss": 2.1894, "rewards/accuracies": 0.0, "rewards/chosen": 7.111448764801025, "rewards/margins": -1.1071972846984863, "rewards/rejected": 8.218646049499512, "step": 972 }, { "epoch": 0.22, "learning_rate": 9.842969289226629e-06, "logits/chosen": -0.8995601534843445, "logits/rejected": -0.9380433559417725, "logps/chosen": -26.906164169311523, "logps/rejected": -34.21868896484375, "loss": 0.5867, "rewards/accuracies": 0.0, "rewards/chosen": 2.2734968662261963, "rewards/margins": -0.7114617824554443, "rewards/rejected": 2.9849586486816406, "step": 973 }, { "epoch": 0.22, "learning_rate": 9.842523318890733e-06, "logits/chosen": -1.6833324432373047, "logits/rejected": -1.74143648147583, "logps/chosen": -73.33482360839844, "logps/rejected": -147.49917602539062, "loss": 1.2256, "rewards/accuracies": 0.0, "rewards/chosen": 3.0044524669647217, "rewards/margins": -2.3517119884490967, "rewards/rejected": 5.356164455413818, "step": 974 }, { "epoch": 0.22, "learning_rate": 9.84207672630223e-06, "logits/chosen": -1.29102623462677, "logits/rejected": -1.3164801597595215, "logps/chosen": -45.616371154785156, "logps/rejected": -74.79885864257812, "loss": 0.2711, "rewards/accuracies": 1.0, "rewards/chosen": 2.572446584701538, "rewards/margins": 0.39771199226379395, "rewards/rejected": 2.174734592437744, "step": 975 }, { "epoch": 0.22, "learning_rate": 9.84162951151851e-06, "logits/chosen": -1.2885442972183228, "logits/rejected": -1.3657550811767578, "logps/chosen": -75.90641784667969, "logps/rejected": -88.78567504882812, "loss": 1.4819, "rewards/accuracies": 0.0, "rewards/chosen": 2.809530019760132, "rewards/margins": -2.8869802951812744, "rewards/rejected": 5.696510314941406, "step": 976 }, { "epoch": 0.22, "learning_rate": 9.841181674597034e-06, "logits/chosen": -1.340554118156433, "logits/rejected": -1.4034119844436646, "logps/chosen": -52.68168640136719, "logps/rejected": -44.50009536743164, "loss": 0.9482, "rewards/accuracies": 0.0, "rewards/chosen": 3.0714080333709717, "rewards/margins": -1.0029966831207275, "rewards/rejected": 4.074404716491699, "step": 977 }, { "epoch": 0.22, "learning_rate": 9.840733215595351e-06, "logits/chosen": -1.4508854150772095, "logits/rejected": -1.437147617340088, "logps/chosen": -56.72785186767578, "logps/rejected": -56.65116882324219, "loss": 0.2507, "rewards/accuracies": 1.0, "rewards/chosen": 2.8171334266662598, "rewards/margins": 0.6754767894744873, "rewards/rejected": 2.1416566371917725, "step": 978 }, { "epoch": 0.22, "learning_rate": 9.840284134571088e-06, "logits/chosen": -1.1317979097366333, "logits/rejected": -1.152156114578247, "logps/chosen": -37.50780487060547, "logps/rejected": -34.298194885253906, "loss": 0.4062, "rewards/accuracies": 0.0, "rewards/chosen": 2.4583892822265625, "rewards/margins": -0.11737823486328125, "rewards/rejected": 2.5757675170898438, "step": 979 }, { "epoch": 0.22, "learning_rate": 9.83983443158195e-06, "logits/chosen": -1.5652501583099365, "logits/rejected": -1.4195045232772827, "logps/chosen": -117.3330078125, "logps/rejected": -21.64809226989746, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 6.702661037445068, "rewards/margins": 5.870746612548828, "rewards/rejected": 0.8319145441055298, "step": 980 }, { "epoch": 0.22, "learning_rate": 9.839384106685721e-06, "logits/chosen": -1.61404287815094, "logits/rejected": -1.529122233390808, "logps/chosen": -59.226959228515625, "logps/rejected": -32.65095138549805, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 1.7115013599395752, "rewards/margins": 1.3695820569992065, "rewards/rejected": 0.34191933274269104, "step": 981 }, { "epoch": 0.22, "learning_rate": 9.838933159940266e-06, "logits/chosen": -1.3424698114395142, "logits/rejected": -1.327730655670166, "logps/chosen": -45.136253356933594, "logps/rejected": -56.55291748046875, "loss": 1.0706, "rewards/accuracies": 1.0, "rewards/chosen": 2.1659278869628906, "rewards/margins": 0.3197890520095825, "rewards/rejected": 1.846138834953308, "step": 982 }, { "epoch": 0.22, "learning_rate": 9.838481591403536e-06, "logits/chosen": -1.4661741256713867, "logits/rejected": -1.3062355518341064, "logps/chosen": -67.34892272949219, "logps/rejected": -32.17656707763672, "loss": 2.1418, "rewards/accuracies": 1.0, "rewards/chosen": 3.35205078125, "rewards/margins": 2.3707823753356934, "rewards/rejected": 0.9812683463096619, "step": 983 }, { "epoch": 0.22, "learning_rate": 9.83802940113355e-06, "logits/chosen": -0.9698315858840942, "logits/rejected": -0.8867858648300171, "logps/chosen": -68.3580551147461, "logps/rejected": -64.606201171875, "loss": 0.8605, "rewards/accuracies": 0.0, "rewards/chosen": 1.923317790031433, "rewards/margins": -1.500771403312683, "rewards/rejected": 3.424089193344116, "step": 984 }, { "epoch": 0.22, "learning_rate": 9.837576589188418e-06, "logits/chosen": -1.1240434646606445, "logits/rejected": -1.1171784400939941, "logps/chosen": -68.37639617919922, "logps/rejected": -81.6307373046875, "loss": 2.3596, "rewards/accuracies": 0.0, "rewards/chosen": 2.2567756175994873, "rewards/margins": -3.2748100757598877, "rewards/rejected": 5.531585693359375, "step": 985 }, { "epoch": 0.22, "learning_rate": 9.837123155626323e-06, "logits/chosen": -1.1267749071121216, "logits/rejected": -0.8376168608665466, "logps/chosen": -125.03794860839844, "logps/rejected": -49.85720443725586, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 6.56112813949585, "rewards/margins": 6.471041202545166, "rewards/rejected": 0.09008675068616867, "step": 986 }, { "epoch": 0.22, "learning_rate": 9.836669100505532e-06, "logits/chosen": -1.2715684175491333, "logits/rejected": -1.2484573125839233, "logps/chosen": -44.02788543701172, "logps/rejected": -60.428985595703125, "loss": 2.3386, "rewards/accuracies": 0.0, "rewards/chosen": 2.5634407997131348, "rewards/margins": -1.692641258239746, "rewards/rejected": 4.256082057952881, "step": 987 }, { "epoch": 0.22, "learning_rate": 9.836214423884387e-06, "logits/chosen": -1.32893967628479, "logits/rejected": -1.194423794746399, "logps/chosen": -62.14335632324219, "logps/rejected": -32.51782989501953, "loss": 0.5201, "rewards/accuracies": 1.0, "rewards/chosen": 3.7464516162872314, "rewards/margins": 3.2433979511260986, "rewards/rejected": 0.5030536651611328, "step": 988 }, { "epoch": 0.22, "learning_rate": 9.835759125821314e-06, "logits/chosen": -1.3790568113327026, "logits/rejected": -1.4498454332351685, "logps/chosen": -47.22383117675781, "logps/rejected": -93.97216796875, "loss": 2.2316, "rewards/accuracies": 0.0, "rewards/chosen": 2.283041477203369, "rewards/margins": -4.338263034820557, "rewards/rejected": 6.621304512023926, "step": 989 }, { "epoch": 0.22, "learning_rate": 9.83530320637482e-06, "logits/chosen": -1.2115229368209839, "logits/rejected": -1.168251633644104, "logps/chosen": -76.8206558227539, "logps/rejected": -114.75331115722656, "loss": 2.7817, "rewards/accuracies": 1.0, "rewards/chosen": 4.966977596282959, "rewards/margins": 1.9956214427947998, "rewards/rejected": 2.971356153488159, "step": 990 }, { "epoch": 0.22, "learning_rate": 9.834846665603486e-06, "logits/chosen": -0.8994433283805847, "logits/rejected": -0.9099751710891724, "logps/chosen": -50.46969985961914, "logps/rejected": -83.4305648803711, "loss": 0.7147, "rewards/accuracies": 0.0, "rewards/chosen": 1.7779873609542847, "rewards/margins": -1.1419917345046997, "rewards/rejected": 2.9199790954589844, "step": 991 }, { "epoch": 0.22, "learning_rate": 9.834389503565978e-06, "logits/chosen": -0.9762993454933167, "logits/rejected": -1.1391022205352783, "logps/chosen": -66.63432312011719, "logps/rejected": -94.65196228027344, "loss": 3.5885, "rewards/accuracies": 0.0, "rewards/chosen": 1.8822952508926392, "rewards/margins": -7.11433744430542, "rewards/rejected": 8.99663257598877, "step": 992 }, { "epoch": 0.22, "learning_rate": 9.833931720321042e-06, "logits/chosen": -1.3635884523391724, "logits/rejected": -1.2684062719345093, "logps/chosen": -112.79358673095703, "logps/rejected": -77.06272888183594, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 7.513317108154297, "rewards/margins": 3.344696521759033, "rewards/rejected": 4.168620586395264, "step": 993 }, { "epoch": 0.22, "learning_rate": 9.833473315927498e-06, "logits/chosen": -1.5049970149993896, "logits/rejected": -1.2377488613128662, "logps/chosen": -143.6363067626953, "logps/rejected": -121.72844696044922, "loss": 1.1238, "rewards/accuracies": 0.0, "rewards/chosen": 3.4086837768554688, "rewards/margins": -2.1061301231384277, "rewards/rejected": 5.5148138999938965, "step": 994 }, { "epoch": 0.22, "learning_rate": 9.833014290444254e-06, "logits/chosen": -1.2941303253173828, "logits/rejected": -1.269573450088501, "logps/chosen": -81.12396240234375, "logps/rejected": -121.4183349609375, "loss": 1.7298, "rewards/accuracies": 0.0, "rewards/chosen": 2.8994140625, "rewards/margins": -3.425454616546631, "rewards/rejected": 6.324868679046631, "step": 995 }, { "epoch": 0.22, "learning_rate": 9.832554643930292e-06, "logits/chosen": -1.138408899307251, "logits/rejected": -1.012860894203186, "logps/chosen": -50.66107940673828, "logps/rejected": -25.541227340698242, "loss": 0.2605, "rewards/accuracies": 1.0, "rewards/chosen": 4.103180885314941, "rewards/margins": 3.1446692943573, "rewards/rejected": 0.9585115313529968, "step": 996 }, { "epoch": 0.22, "learning_rate": 9.832094376444675e-06, "logits/chosen": -1.136370301246643, "logits/rejected": -1.1706420183181763, "logps/chosen": -58.80876159667969, "logps/rejected": -141.7552490234375, "loss": 0.9925, "rewards/accuracies": 0.0, "rewards/chosen": 1.7853561639785767, "rewards/margins": -1.3444381952285767, "rewards/rejected": 3.1297943592071533, "step": 997 }, { "epoch": 0.22, "learning_rate": 9.831633488046547e-06, "logits/chosen": -1.3076586723327637, "logits/rejected": -1.3208720684051514, "logps/chosen": -114.9757080078125, "logps/rejected": -84.28488159179688, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 4.7602691650390625, "rewards/margins": 0.24931764602661133, "rewards/rejected": 4.510951519012451, "step": 998 }, { "epoch": 0.22, "learning_rate": 9.83117197879513e-06, "logits/chosen": -1.7017619609832764, "logits/rejected": -1.7056758403778076, "logps/chosen": -102.61933898925781, "logps/rejected": -90.3292236328125, "loss": 0.9652, "rewards/accuracies": 0.0, "rewards/chosen": 5.981471538543701, "rewards/margins": -1.2253189086914062, "rewards/rejected": 7.206790447235107, "step": 999 }, { "epoch": 0.22, "learning_rate": 9.830709848749727e-06, "logits/chosen": -1.3999241590499878, "logits/rejected": -1.3268779516220093, "logps/chosen": -189.48406982421875, "logps/rejected": -99.82351684570312, "loss": 0.3324, "rewards/accuracies": 1.0, "rewards/chosen": 8.300082206726074, "rewards/margins": 0.27938175201416016, "rewards/rejected": 8.020700454711914, "step": 1000 }, { "epoch": 0.22, "learning_rate": 9.830247097969723e-06, "logits/chosen": -1.51558518409729, "logits/rejected": -1.419700026512146, "logps/chosen": -135.50289916992188, "logps/rejected": -100.94331359863281, "loss": 0.6493, "rewards/accuracies": 1.0, "rewards/chosen": 3.6291701793670654, "rewards/margins": 0.69476318359375, "rewards/rejected": 2.9344069957733154, "step": 1001 }, { "epoch": 0.22, "learning_rate": 9.829783726514578e-06, "logits/chosen": -1.3466449975967407, "logits/rejected": -1.2643357515335083, "logps/chosen": -66.07134246826172, "logps/rejected": -54.78988265991211, "loss": 1.733, "rewards/accuracies": 0.0, "rewards/chosen": 2.444974660873413, "rewards/margins": -1.2626476287841797, "rewards/rejected": 3.7076222896575928, "step": 1002 }, { "epoch": 0.22, "learning_rate": 9.829319734443833e-06, "logits/chosen": -1.1714714765548706, "logits/rejected": -1.0579125881195068, "logps/chosen": -79.32814025878906, "logps/rejected": -94.01473999023438, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 7.951176643371582, "rewards/margins": 3.1358261108398438, "rewards/rejected": 4.815350532531738, "step": 1003 }, { "epoch": 0.22, "learning_rate": 9.828855121817114e-06, "logits/chosen": -1.1375845670700073, "logits/rejected": -1.0088905096054077, "logps/chosen": -61.402957916259766, "logps/rejected": -51.55635452270508, "loss": 0.3255, "rewards/accuracies": 1.0, "rewards/chosen": 2.7532238960266113, "rewards/margins": 2.095212697982788, "rewards/rejected": 0.658011257648468, "step": 1004 }, { "epoch": 0.22, "learning_rate": 9.82838988869412e-06, "logits/chosen": -1.2075649499893188, "logits/rejected": -1.146133303642273, "logps/chosen": -68.11157989501953, "logps/rejected": -40.235191345214844, "loss": 0.3549, "rewards/accuracies": 1.0, "rewards/chosen": 3.064112901687622, "rewards/margins": 1.017195463180542, "rewards/rejected": 2.04691743850708, "step": 1005 }, { "epoch": 0.22, "learning_rate": 9.827924035134629e-06, "logits/chosen": -1.5747402906417847, "logits/rejected": -1.64005446434021, "logps/chosen": -125.83431243896484, "logps/rejected": -146.93174743652344, "loss": 2.8428, "rewards/accuracies": 0.0, "rewards/chosen": 5.019632816314697, "rewards/margins": -5.599955081939697, "rewards/rejected": 10.619587898254395, "step": 1006 }, { "epoch": 0.22, "learning_rate": 9.827457561198507e-06, "logits/chosen": -0.958285927772522, "logits/rejected": -1.0352064371109009, "logps/chosen": -129.310302734375, "logps/rejected": -104.37322998046875, "loss": 1.2779, "rewards/accuracies": 0.0, "rewards/chosen": 6.095442295074463, "rewards/margins": -2.4677138328552246, "rewards/rejected": 8.563156127929688, "step": 1007 }, { "epoch": 0.22, "learning_rate": 9.826990466945695e-06, "logits/chosen": -1.3141765594482422, "logits/rejected": -1.1716750860214233, "logps/chosen": -115.65960693359375, "logps/rejected": -86.12639617919922, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 6.132534980773926, "rewards/margins": 1.4192466735839844, "rewards/rejected": 4.713288307189941, "step": 1008 }, { "epoch": 0.22, "learning_rate": 9.826522752436211e-06, "logits/chosen": -1.1982951164245605, "logits/rejected": -1.2227275371551514, "logps/chosen": -55.258445739746094, "logps/rejected": -60.069557189941406, "loss": 0.8138, "rewards/accuracies": 0.0, "rewards/chosen": 3.0963616371154785, "rewards/margins": -1.2190675735473633, "rewards/rejected": 4.315429210662842, "step": 1009 }, { "epoch": 0.22, "learning_rate": 9.826054417730156e-06, "logits/chosen": -1.3417431116104126, "logits/rejected": -0.8574034571647644, "logps/chosen": -105.80457305908203, "logps/rejected": -106.49839782714844, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": 5.267758846282959, "rewards/margins": 1.2165031433105469, "rewards/rejected": 4.051255702972412, "step": 1010 }, { "epoch": 0.22, "learning_rate": 9.825585462887709e-06, "logits/chosen": -1.4576349258422852, "logits/rejected": -1.3416416645050049, "logps/chosen": -132.82583618164062, "logps/rejected": -62.30925750732422, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 6.286323547363281, "rewards/margins": 3.249185085296631, "rewards/rejected": 3.0371384620666504, "step": 1011 }, { "epoch": 0.22, "learning_rate": 9.825115887969131e-06, "logits/chosen": -1.1443334817886353, "logits/rejected": -1.2458769083023071, "logps/chosen": -65.41145324707031, "logps/rejected": -128.77447509765625, "loss": 1.8243, "rewards/accuracies": 0.0, "rewards/chosen": 2.4432549476623535, "rewards/margins": -3.6037378311157227, "rewards/rejected": 6.046992778778076, "step": 1012 }, { "epoch": 0.22, "learning_rate": 9.82464569303476e-06, "logits/chosen": -1.0852539539337158, "logits/rejected": -0.9726072549819946, "logps/chosen": -92.11015319824219, "logps/rejected": -33.90519332885742, "loss": 0.1963, "rewards/accuracies": 1.0, "rewards/chosen": 1.8589890003204346, "rewards/margins": 0.889603853225708, "rewards/rejected": 0.9693851470947266, "step": 1013 }, { "epoch": 0.22, "learning_rate": 9.824174878145017e-06, "logits/chosen": -1.3434029817581177, "logits/rejected": -1.2856314182281494, "logps/chosen": -62.43940734863281, "logps/rejected": -23.9685001373291, "loss": 0.2515, "rewards/accuracies": 1.0, "rewards/chosen": 2.6806106567382812, "rewards/margins": 1.6530717611312866, "rewards/rejected": 1.0275388956069946, "step": 1014 }, { "epoch": 0.22, "learning_rate": 9.823703443360398e-06, "logits/chosen": -1.4077848196029663, "logits/rejected": -1.3954591751098633, "logps/chosen": -70.29106140136719, "logps/rejected": -92.28309631347656, "loss": 0.5139, "rewards/accuracies": 0.0, "rewards/chosen": 2.3986527919769287, "rewards/margins": -0.2500936985015869, "rewards/rejected": 2.6487464904785156, "step": 1015 }, { "epoch": 0.22, "learning_rate": 9.823231388741483e-06, "logits/chosen": -1.3224663734436035, "logits/rejected": -1.1185232400894165, "logps/chosen": -78.17315673828125, "logps/rejected": -29.61681365966797, "loss": 1.6794, "rewards/accuracies": 1.0, "rewards/chosen": 2.5377609729766846, "rewards/margins": 2.2432820796966553, "rewards/rejected": 0.29447898268699646, "step": 1016 }, { "epoch": 0.23, "learning_rate": 9.822758714348928e-06, "logits/chosen": -1.4515130519866943, "logits/rejected": -1.417769193649292, "logps/chosen": -143.07647705078125, "logps/rejected": -135.69052124023438, "loss": 1.0807, "rewards/accuracies": 0.0, "rewards/chosen": 6.361230373382568, "rewards/margins": -1.5803956985473633, "rewards/rejected": 7.941626071929932, "step": 1017 }, { "epoch": 0.23, "learning_rate": 9.822285420243474e-06, "logits/chosen": -1.0757122039794922, "logits/rejected": -1.043130874633789, "logps/chosen": -80.44635772705078, "logps/rejected": -105.93450164794922, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 4.577758312225342, "rewards/margins": 0.2702503204345703, "rewards/rejected": 4.3075079917907715, "step": 1018 }, { "epoch": 0.23, "learning_rate": 9.821811506485934e-06, "logits/chosen": -1.2439574003219604, "logits/rejected": -1.1610873937606812, "logps/chosen": -73.63667297363281, "logps/rejected": -49.81525802612305, "loss": 0.8909, "rewards/accuracies": 0.0, "rewards/chosen": 2.94219970703125, "rewards/margins": -0.7139599323272705, "rewards/rejected": 3.6561596393585205, "step": 1019 }, { "epoch": 0.23, "learning_rate": 9.821336973137207e-06, "logits/chosen": -1.171714186668396, "logits/rejected": -1.0709989070892334, "logps/chosen": -80.16149139404297, "logps/rejected": -34.33992385864258, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 6.125832557678223, "rewards/margins": 3.92794132232666, "rewards/rejected": 2.1978912353515625, "step": 1020 }, { "epoch": 0.23, "learning_rate": 9.820861820258269e-06, "logits/chosen": -1.3569252490997314, "logits/rejected": -0.9171228408813477, "logps/chosen": -57.157249450683594, "logps/rejected": -102.4144287109375, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 4.940690517425537, "rewards/margins": 1.2866666316986084, "rewards/rejected": 3.6540238857269287, "step": 1021 }, { "epoch": 0.23, "learning_rate": 9.820386047910177e-06, "logits/chosen": -1.3344320058822632, "logits/rejected": -1.2166551351547241, "logps/chosen": -99.37339782714844, "logps/rejected": -52.924774169921875, "loss": 0.2634, "rewards/accuracies": 1.0, "rewards/chosen": 2.939718723297119, "rewards/margins": 0.37951064109802246, "rewards/rejected": 2.5602080821990967, "step": 1022 }, { "epoch": 0.23, "learning_rate": 9.819909656154066e-06, "logits/chosen": -1.3293194770812988, "logits/rejected": -1.192933440208435, "logps/chosen": -143.04066467285156, "logps/rejected": -69.88790893554688, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 5.966075420379639, "rewards/margins": 2.518338203430176, "rewards/rejected": 3.447737216949463, "step": 1023 }, { "epoch": 0.23, "learning_rate": 9.81943264505115e-06, "logits/chosen": -1.4466582536697388, "logits/rejected": -1.4606455564498901, "logps/chosen": -81.30967712402344, "logps/rejected": -64.24911499023438, "loss": 1.1605, "rewards/accuracies": 0.0, "rewards/chosen": 2.076803684234619, "rewards/margins": -2.2117247581481934, "rewards/rejected": 4.2885284423828125, "step": 1024 }, { "epoch": 0.23, "learning_rate": 9.818955014662725e-06, "logits/chosen": -1.2718749046325684, "logits/rejected": -1.2538560628890991, "logps/chosen": -62.199928283691406, "logps/rejected": -89.2511978149414, "loss": 0.1895, "rewards/accuracies": 1.0, "rewards/chosen": 3.533221483230591, "rewards/margins": 0.9565742015838623, "rewards/rejected": 2.5766472816467285, "step": 1025 }, { "epoch": 0.23, "learning_rate": 9.818476765050167e-06, "logits/chosen": -1.4546093940734863, "logits/rejected": -1.4546093940734863, "logps/chosen": -83.20140838623047, "logps/rejected": -83.20140838623047, "loss": 1.1301, "rewards/accuracies": 0.0, "rewards/chosen": 3.1426186561584473, "rewards/margins": 0.0, "rewards/rejected": 3.1426186561584473, "step": 1026 }, { "epoch": 0.23, "learning_rate": 9.817997896274925e-06, "logits/chosen": -1.243457317352295, "logits/rejected": -1.2788100242614746, "logps/chosen": -59.053993225097656, "logps/rejected": -73.90027618408203, "loss": 1.0058, "rewards/accuracies": 0.0, "rewards/chosen": 1.974958062171936, "rewards/margins": -1.0070534944534302, "rewards/rejected": 2.982011556625366, "step": 1027 }, { "epoch": 0.23, "learning_rate": 9.817518408398536e-06, "logits/chosen": -1.3829379081726074, "logits/rejected": -1.4668567180633545, "logps/chosen": -28.655590057373047, "logps/rejected": -132.47817993164062, "loss": 3.8688, "rewards/accuracies": 0.0, "rewards/chosen": 2.3414173126220703, "rewards/margins": -3.163944721221924, "rewards/rejected": 5.505362033843994, "step": 1028 }, { "epoch": 0.23, "learning_rate": 9.817038301482612e-06, "logits/chosen": -1.2835253477096558, "logits/rejected": -1.146525502204895, "logps/chosen": -105.21807861328125, "logps/rejected": -62.20195007324219, "loss": 0.2318, "rewards/accuracies": 1.0, "rewards/chosen": 5.252252101898193, "rewards/margins": 0.9681816101074219, "rewards/rejected": 4.2840704917907715, "step": 1029 }, { "epoch": 0.23, "learning_rate": 9.81655757558885e-06, "logits/chosen": -1.3404877185821533, "logits/rejected": -1.3016589879989624, "logps/chosen": -85.9533462524414, "logps/rejected": -214.8535919189453, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 6.777588844299316, "rewards/margins": 2.7924249172210693, "rewards/rejected": 3.985163927078247, "step": 1030 }, { "epoch": 0.23, "learning_rate": 9.816076230779014e-06, "logits/chosen": -1.253393292427063, "logits/rejected": -1.2566362619400024, "logps/chosen": -37.43889617919922, "logps/rejected": -48.96858215332031, "loss": 1.5219, "rewards/accuracies": 0.0, "rewards/chosen": 1.6902507543563843, "rewards/margins": -2.982419013977051, "rewards/rejected": 4.672669887542725, "step": 1031 }, { "epoch": 0.23, "learning_rate": 9.815594267114962e-06, "logits/chosen": -1.4111696481704712, "logits/rejected": -1.4111696481704712, "logps/chosen": -44.872276306152344, "logps/rejected": -44.872276306152344, "loss": 0.3573, "rewards/accuracies": 0.0, "rewards/chosen": 1.6827442646026611, "rewards/margins": 0.0, "rewards/rejected": 1.6827442646026611, "step": 1032 }, { "epoch": 0.23, "learning_rate": 9.815111684658622e-06, "logits/chosen": -1.1599825620651245, "logits/rejected": -1.1599825620651245, "logps/chosen": -54.519561767578125, "logps/rejected": -54.519561767578125, "loss": 0.4318, "rewards/accuracies": 0.0, "rewards/chosen": 3.3516845703125, "rewards/margins": 0.0, "rewards/rejected": 3.3516845703125, "step": 1033 }, { "epoch": 0.23, "learning_rate": 9.814628483472006e-06, "logits/chosen": -1.0702059268951416, "logits/rejected": -0.9869692921638489, "logps/chosen": -49.050315856933594, "logps/rejected": -31.05177879333496, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 1.9351913928985596, "rewards/margins": 0.36195969581604004, "rewards/rejected": 1.5732316970825195, "step": 1034 }, { "epoch": 0.23, "learning_rate": 9.814144663617204e-06, "logits/chosen": -1.3612544536590576, "logits/rejected": -1.3842291831970215, "logps/chosen": -51.28517532348633, "logps/rejected": -42.37656021118164, "loss": 0.3801, "rewards/accuracies": 0.0, "rewards/chosen": 2.6900737285614014, "rewards/margins": -0.12733769416809082, "rewards/rejected": 2.817411422729492, "step": 1035 }, { "epoch": 0.23, "learning_rate": 9.813660225156385e-06, "logits/chosen": -1.1713104248046875, "logits/rejected": -1.1043424606323242, "logps/chosen": -114.05952453613281, "logps/rejected": -67.78871154785156, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 6.4792985916137695, "rewards/margins": 3.869163751602173, "rewards/rejected": 2.6101348400115967, "step": 1036 }, { "epoch": 0.23, "learning_rate": 9.813175168151801e-06, "logits/chosen": -1.3040884733200073, "logits/rejected": -1.2744724750518799, "logps/chosen": -53.48360061645508, "logps/rejected": -51.00994873046875, "loss": 0.9349, "rewards/accuracies": 0.0, "rewards/chosen": 2.6041600704193115, "rewards/margins": -1.6530749797821045, "rewards/rejected": 4.257235050201416, "step": 1037 }, { "epoch": 0.23, "learning_rate": 9.812689492665777e-06, "logits/chosen": -1.2703512907028198, "logits/rejected": -1.0608361959457397, "logps/chosen": -167.66226196289062, "logps/rejected": -48.62456130981445, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": 6.323977947235107, "rewards/margins": 1.47636079788208, "rewards/rejected": 4.847617149353027, "step": 1038 }, { "epoch": 0.23, "learning_rate": 9.812203198760722e-06, "logits/chosen": -1.0533130168914795, "logits/rejected": -1.0533130168914795, "logps/chosen": -22.90125274658203, "logps/rejected": -22.90125274658203, "loss": 0.6037, "rewards/accuracies": 0.0, "rewards/chosen": 2.1317391395568848, "rewards/margins": 0.0, "rewards/rejected": 2.1317391395568848, "step": 1039 }, { "epoch": 0.23, "learning_rate": 9.811716286499125e-06, "logits/chosen": -1.1629899740219116, "logits/rejected": -1.2045767307281494, "logps/chosen": -40.61684036254883, "logps/rejected": -61.50786590576172, "loss": 2.0203, "rewards/accuracies": 0.0, "rewards/chosen": 3.726501226425171, "rewards/margins": -3.782757043838501, "rewards/rejected": 7.509258270263672, "step": 1040 }, { "epoch": 0.23, "learning_rate": 9.811228755943551e-06, "logits/chosen": -1.5127571821212769, "logits/rejected": -1.5093718767166138, "logps/chosen": -54.3673095703125, "logps/rejected": -50.55043029785156, "loss": 0.403, "rewards/accuracies": 1.0, "rewards/chosen": 3.025515079498291, "rewards/margins": 0.1668229103088379, "rewards/rejected": 2.858692169189453, "step": 1041 }, { "epoch": 0.23, "learning_rate": 9.810740607156647e-06, "logits/chosen": -1.2094099521636963, "logits/rejected": -1.0656918287277222, "logps/chosen": -54.47875213623047, "logps/rejected": -52.8162956237793, "loss": 0.2234, "rewards/accuracies": 1.0, "rewards/chosen": 3.5020394325256348, "rewards/margins": 0.6099522113800049, "rewards/rejected": 2.89208722114563, "step": 1042 }, { "epoch": 0.23, "learning_rate": 9.810251840201143e-06, "logits/chosen": -1.4654942750930786, "logits/rejected": -1.4165289402008057, "logps/chosen": -106.64598083496094, "logps/rejected": -41.74728012084961, "loss": 0.582, "rewards/accuracies": 1.0, "rewards/chosen": 2.587790012359619, "rewards/margins": 0.744540810585022, "rewards/rejected": 1.8432492017745972, "step": 1043 }, { "epoch": 0.23, "learning_rate": 9.80976245513984e-06, "logits/chosen": -1.4493176937103271, "logits/rejected": -1.401231288909912, "logps/chosen": -52.647621154785156, "logps/rejected": -31.665706634521484, "loss": 1.3253, "rewards/accuracies": 1.0, "rewards/chosen": 2.7083213329315186, "rewards/margins": 0.9886584281921387, "rewards/rejected": 1.7196629047393799, "step": 1044 }, { "epoch": 0.23, "learning_rate": 9.809272452035622e-06, "logits/chosen": -1.4459961652755737, "logits/rejected": -1.4114809036254883, "logps/chosen": -54.6330451965332, "logps/rejected": -45.46782302856445, "loss": 0.8055, "rewards/accuracies": 0.0, "rewards/chosen": 1.9081432819366455, "rewards/margins": -0.6886811256408691, "rewards/rejected": 2.5968244075775146, "step": 1045 }, { "epoch": 0.23, "learning_rate": 9.808781830951457e-06, "logits/chosen": -1.0187238454818726, "logits/rejected": -0.8486161231994629, "logps/chosen": -42.68179702758789, "logps/rejected": -30.966934204101562, "loss": 0.575, "rewards/accuracies": 0.0, "rewards/chosen": 1.3168308734893799, "rewards/margins": -0.657806396484375, "rewards/rejected": 1.9746372699737549, "step": 1046 }, { "epoch": 0.23, "learning_rate": 9.808290591950386e-06, "logits/chosen": -1.2879408597946167, "logits/rejected": -1.119009256362915, "logps/chosen": -50.11856460571289, "logps/rejected": -14.509593963623047, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 3.35058856010437, "rewards/margins": 2.8670456409454346, "rewards/rejected": 0.4835428297519684, "step": 1047 }, { "epoch": 0.23, "learning_rate": 9.807798735095533e-06, "logits/chosen": -1.501505732536316, "logits/rejected": -1.3749384880065918, "logps/chosen": -144.3510284423828, "logps/rejected": -111.00617218017578, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 6.773961067199707, "rewards/margins": 3.063915491104126, "rewards/rejected": 3.710045576095581, "step": 1048 }, { "epoch": 0.23, "learning_rate": 9.807306260450098e-06, "logits/chosen": -1.3836849927902222, "logits/rejected": -1.3849451541900635, "logps/chosen": -46.955589294433594, "logps/rejected": -79.32820129394531, "loss": 0.8414, "rewards/accuracies": 0.0, "rewards/chosen": 2.7217071056365967, "rewards/margins": -0.6596558094024658, "rewards/rejected": 3.3813629150390625, "step": 1049 }, { "epoch": 0.23, "learning_rate": 9.806813168077367e-06, "logits/chosen": -1.2647467851638794, "logits/rejected": -1.218745231628418, "logps/chosen": -63.82318878173828, "logps/rejected": -58.6875, "loss": 1.4412, "rewards/accuracies": 0.0, "rewards/chosen": 2.7878594398498535, "rewards/margins": -2.824373722076416, "rewards/rejected": 5.6122331619262695, "step": 1050 }, { "epoch": 0.23, "learning_rate": 9.806319458040701e-06, "logits/chosen": -1.5180737972259521, "logits/rejected": -1.232232689857483, "logps/chosen": -111.58767700195312, "logps/rejected": -43.5535888671875, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 4.78976583480835, "rewards/margins": 3.29472017288208, "rewards/rejected": 1.49504554271698, "step": 1051 }, { "epoch": 0.23, "learning_rate": 9.805825130403536e-06, "logits/chosen": -1.3444057703018188, "logits/rejected": -1.3444057703018188, "logps/chosen": -61.550498962402344, "logps/rejected": -61.550498962402344, "loss": 0.3558, "rewards/accuracies": 0.0, "rewards/chosen": 1.3582619428634644, "rewards/margins": 0.0, "rewards/rejected": 1.3582619428634644, "step": 1052 }, { "epoch": 0.23, "learning_rate": 9.805330185229397e-06, "logits/chosen": -1.055993914604187, "logits/rejected": -1.1146131753921509, "logps/chosen": -54.37031555175781, "logps/rejected": -99.13372802734375, "loss": 2.2711, "rewards/accuracies": 0.0, "rewards/chosen": 2.878370761871338, "rewards/margins": -1.7938246726989746, "rewards/rejected": 4.6721954345703125, "step": 1053 }, { "epoch": 0.23, "learning_rate": 9.804834622581879e-06, "logits/chosen": -1.5182974338531494, "logits/rejected": -1.411035180091858, "logps/chosen": -58.320884704589844, "logps/rejected": -7.234003067016602, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 3.7658486366271973, "rewards/margins": 2.406202793121338, "rewards/rejected": 1.3596458435058594, "step": 1054 }, { "epoch": 0.23, "learning_rate": 9.804338442524661e-06, "logits/chosen": -1.5649381875991821, "logits/rejected": -1.5790740251541138, "logps/chosen": -101.72013854980469, "logps/rejected": -131.4733123779297, "loss": 2.5449, "rewards/accuracies": 0.0, "rewards/chosen": 4.488264560699463, "rewards/margins": -4.864517688751221, "rewards/rejected": 9.352782249450684, "step": 1055 }, { "epoch": 0.23, "learning_rate": 9.803841645121505e-06, "logits/chosen": -1.4919531345367432, "logits/rejected": -1.5428460836410522, "logps/chosen": -42.9388427734375, "logps/rejected": -68.21398162841797, "loss": 1.3089, "rewards/accuracies": 0.0, "rewards/chosen": 2.439765214920044, "rewards/margins": -0.633760929107666, "rewards/rejected": 3.07352614402771, "step": 1056 }, { "epoch": 0.23, "learning_rate": 9.803344230436245e-06, "logits/chosen": -1.3314298391342163, "logits/rejected": -0.9989550709724426, "logps/chosen": -170.79122924804688, "logps/rejected": -11.249937057495117, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 8.203845024108887, "rewards/margins": 7.424657344818115, "rewards/rejected": 0.7791876196861267, "step": 1057 }, { "epoch": 0.23, "learning_rate": 9.802846198532798e-06, "logits/chosen": -1.2134020328521729, "logits/rejected": -1.1408499479293823, "logps/chosen": -44.050174713134766, "logps/rejected": -43.936424255371094, "loss": 0.5775, "rewards/accuracies": 0.0, "rewards/chosen": 1.8970733880996704, "rewards/margins": -0.7740861177444458, "rewards/rejected": 2.671159505844116, "step": 1058 }, { "epoch": 0.23, "learning_rate": 9.80234754947516e-06, "logits/chosen": -0.9841809272766113, "logits/rejected": -0.9217202663421631, "logps/chosen": -32.01210021972656, "logps/rejected": -39.1709098815918, "loss": 0.2942, "rewards/accuracies": 1.0, "rewards/chosen": 2.381758451461792, "rewards/margins": 0.5255954265594482, "rewards/rejected": 1.8561630249023438, "step": 1059 }, { "epoch": 0.23, "learning_rate": 9.801848283327406e-06, "logits/chosen": -1.2432308197021484, "logits/rejected": -1.2418147325515747, "logps/chosen": -66.13822937011719, "logps/rejected": -68.36527252197266, "loss": 1.117, "rewards/accuracies": 0.0, "rewards/chosen": 1.518489122390747, "rewards/margins": -2.0984396934509277, "rewards/rejected": 3.616928815841675, "step": 1060 }, { "epoch": 0.23, "learning_rate": 9.801348400153692e-06, "logits/chosen": -1.3822027444839478, "logits/rejected": -1.3949204683303833, "logps/chosen": -108.07977294921875, "logps/rejected": -168.91033935546875, "loss": 2.3342, "rewards/accuracies": 0.0, "rewards/chosen": 7.336270332336426, "rewards/margins": -2.213653564453125, "rewards/rejected": 9.54992389678955, "step": 1061 }, { "epoch": 0.24, "learning_rate": 9.800847900018251e-06, "logits/chosen": -1.0193208456039429, "logits/rejected": -1.1260638236999512, "logps/chosen": -57.303348541259766, "logps/rejected": -56.53997039794922, "loss": 2.7082, "rewards/accuracies": 0.0, "rewards/chosen": 0.4221054017543793, "rewards/margins": -5.34001350402832, "rewards/rejected": 5.762118816375732, "step": 1062 }, { "epoch": 0.24, "learning_rate": 9.800346782985395e-06, "logits/chosen": -1.2707241773605347, "logits/rejected": -1.1141817569732666, "logps/chosen": -111.08694458007812, "logps/rejected": -54.12826919555664, "loss": 0.5662, "rewards/accuracies": 1.0, "rewards/chosen": 5.78204345703125, "rewards/margins": 3.1578571796417236, "rewards/rejected": 2.6241862773895264, "step": 1063 }, { "epoch": 0.24, "learning_rate": 9.799845049119517e-06, "logits/chosen": -1.5165092945098877, "logits/rejected": -1.4531782865524292, "logps/chosen": -73.63929748535156, "logps/rejected": -84.22799682617188, "loss": 1.3433, "rewards/accuracies": 0.0, "rewards/chosen": 4.5597734451293945, "rewards/margins": -1.2646331787109375, "rewards/rejected": 5.824406623840332, "step": 1064 }, { "epoch": 0.24, "learning_rate": 9.79934269848509e-06, "logits/chosen": -1.4089226722717285, "logits/rejected": -1.4213366508483887, "logps/chosen": -52.90803527832031, "logps/rejected": -55.6746826171875, "loss": 0.5456, "rewards/accuracies": 0.0, "rewards/chosen": 2.1527321338653564, "rewards/margins": -0.6300017833709717, "rewards/rejected": 2.782733917236328, "step": 1065 }, { "epoch": 0.24, "learning_rate": 9.798839731146662e-06, "logits/chosen": -0.9355743527412415, "logits/rejected": -0.9105193018913269, "logps/chosen": -92.90870666503906, "logps/rejected": -40.65999984741211, "loss": 0.5045, "rewards/accuracies": 1.0, "rewards/chosen": 4.963310241699219, "rewards/margins": 2.0118649005889893, "rewards/rejected": 2.9514453411102295, "step": 1066 }, { "epoch": 0.24, "learning_rate": 9.798336147168865e-06, "logits/chosen": -1.5054693222045898, "logits/rejected": -1.5138981342315674, "logps/chosen": -44.917808532714844, "logps/rejected": -51.50783157348633, "loss": 2.107, "rewards/accuracies": 0.0, "rewards/chosen": 2.039614200592041, "rewards/margins": -0.923203706741333, "rewards/rejected": 2.962817907333374, "step": 1067 }, { "epoch": 0.24, "learning_rate": 9.797831946616408e-06, "logits/chosen": -1.2745658159255981, "logits/rejected": -1.2745658159255981, "logps/chosen": -47.88656234741211, "logps/rejected": -47.88656234741211, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.8267948627471924, "rewards/margins": 0.0, "rewards/rejected": 1.8267948627471924, "step": 1068 }, { "epoch": 0.24, "learning_rate": 9.797327129554081e-06, "logits/chosen": -1.2640657424926758, "logits/rejected": -1.2640657424926758, "logps/chosen": -50.789581298828125, "logps/rejected": -50.789581298828125, "loss": 0.3468, "rewards/accuracies": 0.0, "rewards/chosen": 5.564873695373535, "rewards/margins": 0.0, "rewards/rejected": 5.564873695373535, "step": 1069 }, { "epoch": 0.24, "learning_rate": 9.796821696046748e-06, "logits/chosen": -1.212841272354126, "logits/rejected": -1.212841272354126, "logps/chosen": -7.741509437561035, "logps/rejected": -7.741509437561035, "loss": 1.6462, "rewards/accuracies": 0.0, "rewards/chosen": 1.9956706762313843, "rewards/margins": 0.0, "rewards/rejected": 1.9956706762313843, "step": 1070 }, { "epoch": 0.24, "learning_rate": 9.79631564615936e-06, "logits/chosen": -1.2509684562683105, "logits/rejected": -1.1560606956481934, "logps/chosen": -123.70265197753906, "logps/rejected": -64.4921875, "loss": 0.9771, "rewards/accuracies": 1.0, "rewards/chosen": 6.0935378074646, "rewards/margins": 3.6056783199310303, "rewards/rejected": 2.4878594875335693, "step": 1071 }, { "epoch": 0.24, "learning_rate": 9.79580897995694e-06, "logits/chosen": -1.5890783071517944, "logits/rejected": -1.420986533164978, "logps/chosen": -89.2588119506836, "logps/rejected": -29.176490783691406, "loss": 0.7014, "rewards/accuracies": 1.0, "rewards/chosen": 3.092427968978882, "rewards/margins": 2.615973711013794, "rewards/rejected": 0.4764541685581207, "step": 1072 }, { "epoch": 0.24, "learning_rate": 9.795301697504595e-06, "logits/chosen": -1.3303873538970947, "logits/rejected": -1.315301775932312, "logps/chosen": -48.634910583496094, "logps/rejected": -64.02598571777344, "loss": 2.3007, "rewards/accuracies": 1.0, "rewards/chosen": 2.6120285987854004, "rewards/margins": 0.24927210807800293, "rewards/rejected": 2.3627564907073975, "step": 1073 }, { "epoch": 0.24, "learning_rate": 9.794793798867512e-06, "logits/chosen": -1.8209501504898071, "logits/rejected": -1.7026011943817139, "logps/chosen": -122.57542419433594, "logps/rejected": -70.79533386230469, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": 5.2226715087890625, "rewards/margins": 1.0652785301208496, "rewards/rejected": 4.157392978668213, "step": 1074 }, { "epoch": 0.24, "learning_rate": 9.794285284110949e-06, "logits/chosen": -1.4396276473999023, "logits/rejected": -1.3935168981552124, "logps/chosen": -76.73194122314453, "logps/rejected": -74.09150695800781, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 6.464014530181885, "rewards/margins": 4.310751914978027, "rewards/rejected": 2.1532623767852783, "step": 1075 }, { "epoch": 0.24, "learning_rate": 9.793776153300253e-06, "logits/chosen": -1.4461897611618042, "logits/rejected": -1.4098858833312988, "logps/chosen": -59.97856903076172, "logps/rejected": -84.57408142089844, "loss": 0.2312, "rewards/accuracies": 1.0, "rewards/chosen": 2.240053653717041, "rewards/margins": 0.8827004432678223, "rewards/rejected": 1.3573532104492188, "step": 1076 }, { "epoch": 0.24, "learning_rate": 9.793266406500847e-06, "logits/chosen": -1.5898958444595337, "logits/rejected": -1.653206467628479, "logps/chosen": -132.2926483154297, "logps/rejected": -78.49783325195312, "loss": 1.9576, "rewards/accuracies": 0.0, "rewards/chosen": 4.017936706542969, "rewards/margins": -3.8769469261169434, "rewards/rejected": 7.894883632659912, "step": 1077 }, { "epoch": 0.24, "learning_rate": 9.792756043778229e-06, "logits/chosen": -1.1851215362548828, "logits/rejected": -1.0907057523727417, "logps/chosen": -82.0683364868164, "logps/rejected": -71.57781982421875, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 6.527072906494141, "rewards/margins": 2.366809844970703, "rewards/rejected": 4.1602630615234375, "step": 1078 }, { "epoch": 0.24, "learning_rate": 9.79224506519798e-06, "logits/chosen": -1.2741769552230835, "logits/rejected": -1.2175238132476807, "logps/chosen": -89.86808776855469, "logps/rejected": -91.71720886230469, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 6.6382646560668945, "rewards/margins": 0.6286849975585938, "rewards/rejected": 6.009579658508301, "step": 1079 }, { "epoch": 0.24, "learning_rate": 9.791733470825763e-06, "logits/chosen": -1.643949031829834, "logits/rejected": -1.6749765872955322, "logps/chosen": -51.80967330932617, "logps/rejected": -153.3814697265625, "loss": 0.7092, "rewards/accuracies": 0.0, "rewards/chosen": 3.9539706707000732, "rewards/margins": -1.1118342876434326, "rewards/rejected": 5.065804958343506, "step": 1080 }, { "epoch": 0.24, "learning_rate": 9.791221260727313e-06, "logits/chosen": -1.238793969154358, "logits/rejected": -1.2777026891708374, "logps/chosen": -58.918701171875, "logps/rejected": -95.35691833496094, "loss": 0.7519, "rewards/accuracies": 0.0, "rewards/chosen": 3.6989059448242188, "rewards/margins": -1.189361572265625, "rewards/rejected": 4.888267517089844, "step": 1081 }, { "epoch": 0.24, "learning_rate": 9.790708434968448e-06, "logits/chosen": -1.2882630825042725, "logits/rejected": -1.2213047742843628, "logps/chosen": -43.764015197753906, "logps/rejected": -49.336517333984375, "loss": 2.1352, "rewards/accuracies": 0.0, "rewards/chosen": 2.0752525329589844, "rewards/margins": -0.4965989589691162, "rewards/rejected": 2.5718514919281006, "step": 1082 }, { "epoch": 0.24, "learning_rate": 9.790194993615065e-06, "logits/chosen": -1.3659547567367554, "logits/rejected": -1.2801597118377686, "logps/chosen": -114.0074462890625, "logps/rejected": -66.58320617675781, "loss": 0.6463, "rewards/accuracies": 1.0, "rewards/chosen": 3.0482513904571533, "rewards/margins": 0.6677565574645996, "rewards/rejected": 2.3804948329925537, "step": 1083 }, { "epoch": 0.24, "learning_rate": 9.78968093673314e-06, "logits/chosen": -1.321694016456604, "logits/rejected": -1.248467206954956, "logps/chosen": -154.927001953125, "logps/rejected": -142.00503540039062, "loss": 0.4564, "rewards/accuracies": 0.0, "rewards/chosen": 5.631233215332031, "rewards/margins": -0.34509754180908203, "rewards/rejected": 5.976330757141113, "step": 1084 }, { "epoch": 0.24, "learning_rate": 9.789166264388732e-06, "logits/chosen": -1.7331289052963257, "logits/rejected": -1.6694585084915161, "logps/chosen": -83.30670928955078, "logps/rejected": -56.45250701904297, "loss": 1.373, "rewards/accuracies": 0.0, "rewards/chosen": 2.9252357482910156, "rewards/margins": -4.887580871582031e-05, "rewards/rejected": 2.9252846240997314, "step": 1085 }, { "epoch": 0.24, "learning_rate": 9.78865097664797e-06, "logits/chosen": -0.8729621171951294, "logits/rejected": -1.2572203874588013, "logps/chosen": -42.88665008544922, "logps/rejected": -48.23701095581055, "loss": 0.7546, "rewards/accuracies": 0.0, "rewards/chosen": 2.3616585731506348, "rewards/margins": -0.8071787357330322, "rewards/rejected": 3.168837308883667, "step": 1086 }, { "epoch": 0.24, "learning_rate": 9.788135073577069e-06, "logits/chosen": -1.3743878602981567, "logits/rejected": -1.4160499572753906, "logps/chosen": -59.154754638671875, "logps/rejected": -98.23724365234375, "loss": 1.322, "rewards/accuracies": 0.0, "rewards/chosen": 3.4753310680389404, "rewards/margins": -2.0465915203094482, "rewards/rejected": 5.521922588348389, "step": 1087 }, { "epoch": 0.24, "learning_rate": 9.787618555242321e-06, "logits/chosen": -1.6477025747299194, "logits/rejected": -1.679486870765686, "logps/chosen": -108.10134887695312, "logps/rejected": -91.25845336914062, "loss": 0.1438, "rewards/accuracies": 1.0, "rewards/chosen": 7.5716094970703125, "rewards/margins": 2.738616943359375, "rewards/rejected": 4.8329925537109375, "step": 1088 }, { "epoch": 0.24, "learning_rate": 9.787101421710099e-06, "logits/chosen": -1.642659306526184, "logits/rejected": -1.3977118730545044, "logps/chosen": -144.60520935058594, "logps/rejected": -88.60416412353516, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 7.062989711761475, "rewards/margins": 3.802825689315796, "rewards/rejected": 3.2601640224456787, "step": 1089 }, { "epoch": 0.24, "learning_rate": 9.786583673046851e-06, "logits/chosen": -1.4247416257858276, "logits/rejected": -1.3216381072998047, "logps/chosen": -116.96450805664062, "logps/rejected": -80.26426696777344, "loss": 0.1774, "rewards/accuracies": 1.0, "rewards/chosen": 9.121896743774414, "rewards/margins": 5.153339385986328, "rewards/rejected": 3.968557119369507, "step": 1090 }, { "epoch": 0.24, "learning_rate": 9.786065309319107e-06, "logits/chosen": -1.1576672792434692, "logits/rejected": -1.1111345291137695, "logps/chosen": -108.93428039550781, "logps/rejected": -80.37428283691406, "loss": 1.8885, "rewards/accuracies": 1.0, "rewards/chosen": 5.231106758117676, "rewards/margins": 2.0223779678344727, "rewards/rejected": 3.208728790283203, "step": 1091 }, { "epoch": 0.24, "learning_rate": 9.785546330593479e-06, "logits/chosen": -1.6454427242279053, "logits/rejected": -1.6442033052444458, "logps/chosen": -70.17076110839844, "logps/rejected": -64.0556411743164, "loss": 0.7557, "rewards/accuracies": 0.0, "rewards/chosen": 4.049421787261963, "rewards/margins": -0.8023538589477539, "rewards/rejected": 4.851775646209717, "step": 1092 }, { "epoch": 0.24, "learning_rate": 9.78502673693665e-06, "logits/chosen": -1.3122096061706543, "logits/rejected": -1.2332723140716553, "logps/chosen": -61.139339447021484, "logps/rejected": -38.928131103515625, "loss": 0.5436, "rewards/accuracies": 0.0, "rewards/chosen": 1.9722843170166016, "rewards/margins": -0.637988805770874, "rewards/rejected": 2.6102731227874756, "step": 1093 }, { "epoch": 0.24, "learning_rate": 9.784506528415388e-06, "logits/chosen": -1.3584870100021362, "logits/rejected": -1.3057541847229004, "logps/chosen": -67.4134521484375, "logps/rejected": -59.00647735595703, "loss": 0.5054, "rewards/accuracies": 0.0, "rewards/chosen": 2.208421468734741, "rewards/margins": -0.5289671421051025, "rewards/rejected": 2.7373886108398438, "step": 1094 }, { "epoch": 0.24, "learning_rate": 9.78398570509654e-06, "logits/chosen": -1.2676242589950562, "logits/rejected": -1.1840925216674805, "logps/chosen": -104.94703674316406, "logps/rejected": -77.96148681640625, "loss": 0.0745, "rewards/accuracies": 1.0, "rewards/chosen": 6.501661777496338, "rewards/margins": 2.9670989513397217, "rewards/rejected": 3.534562826156616, "step": 1095 }, { "epoch": 0.24, "learning_rate": 9.783464267047027e-06, "logits/chosen": -1.4896600246429443, "logits/rejected": -1.4032189846038818, "logps/chosen": -63.331382751464844, "logps/rejected": -43.77896499633789, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 2.1757895946502686, "rewards/margins": -0.05594372749328613, "rewards/rejected": 2.2317333221435547, "step": 1096 }, { "epoch": 0.24, "learning_rate": 9.782942214333855e-06, "logits/chosen": -1.1677972078323364, "logits/rejected": -1.2598425149917603, "logps/chosen": -47.929115295410156, "logps/rejected": -79.94563293457031, "loss": 1.2366, "rewards/accuracies": 0.0, "rewards/chosen": 1.9150924682617188, "rewards/margins": -1.0853867530822754, "rewards/rejected": 3.000479221343994, "step": 1097 }, { "epoch": 0.24, "learning_rate": 9.782419547024108e-06, "logits/chosen": -1.1034555435180664, "logits/rejected": -1.1238019466400146, "logps/chosen": -53.62928009033203, "logps/rejected": -62.503501892089844, "loss": 0.5344, "rewards/accuracies": 0.0, "rewards/chosen": 1.5811340808868408, "rewards/margins": -0.4153258800506592, "rewards/rejected": 1.9964599609375, "step": 1098 }, { "epoch": 0.24, "learning_rate": 9.781896265184944e-06, "logits/chosen": -1.2708591222763062, "logits/rejected": -1.2677251100540161, "logps/chosen": -51.164527893066406, "logps/rejected": -76.06852722167969, "loss": 0.5176, "rewards/accuracies": 0.0, "rewards/chosen": 3.5689949989318848, "rewards/margins": -0.14502477645874023, "rewards/rejected": 3.714019775390625, "step": 1099 }, { "epoch": 0.24, "learning_rate": 9.781372368883607e-06, "logits/chosen": -1.2155752182006836, "logits/rejected": -1.288421630859375, "logps/chosen": -74.78033447265625, "logps/rejected": -98.00746154785156, "loss": 1.5936, "rewards/accuracies": 0.0, "rewards/chosen": 1.9076370000839233, "rewards/margins": -3.108254909515381, "rewards/rejected": 5.015892028808594, "step": 1100 }, { "epoch": 0.24, "learning_rate": 9.780847858187414e-06, "logits/chosen": -1.407689094543457, "logits/rejected": -1.3213496208190918, "logps/chosen": -103.72503662109375, "logps/rejected": -74.63104248046875, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 4.954010009765625, "rewards/margins": 2.316946268081665, "rewards/rejected": 2.63706374168396, "step": 1101 }, { "epoch": 0.24, "learning_rate": 9.780322733163766e-06, "logits/chosen": -1.3630101680755615, "logits/rejected": -1.3876819610595703, "logps/chosen": -35.65627670288086, "logps/rejected": -35.48155212402344, "loss": 0.5371, "rewards/accuracies": 0.0, "rewards/chosen": 3.019990921020508, "rewards/margins": -0.6300098896026611, "rewards/rejected": 3.650000810623169, "step": 1102 }, { "epoch": 0.24, "learning_rate": 9.779796993880135e-06, "logits/chosen": -1.5043736696243286, "logits/rejected": -1.466377854347229, "logps/chosen": -69.86091613769531, "logps/rejected": -85.82177734375, "loss": 0.5208, "rewards/accuracies": 1.0, "rewards/chosen": 3.8364334106445312, "rewards/margins": 0.8746504783630371, "rewards/rejected": 2.961782932281494, "step": 1103 }, { "epoch": 0.24, "learning_rate": 9.779270640404082e-06, "logits/chosen": -1.3350917100906372, "logits/rejected": -1.4250621795654297, "logps/chosen": -75.9434585571289, "logps/rejected": -63.772335052490234, "loss": 2.2631, "rewards/accuracies": 0.0, "rewards/chosen": 3.57767653465271, "rewards/margins": -1.562955617904663, "rewards/rejected": 5.140632152557373, "step": 1104 }, { "epoch": 0.24, "learning_rate": 9.778743672803241e-06, "logits/chosen": -1.162317156791687, "logits/rejected": -1.111754298210144, "logps/chosen": -47.00764846801758, "logps/rejected": -54.535179138183594, "loss": 0.6674, "rewards/accuracies": 0.0, "rewards/chosen": 1.9932597875595093, "rewards/margins": -0.9891828298568726, "rewards/rejected": 2.982442617416382, "step": 1105 }, { "epoch": 0.24, "learning_rate": 9.778216091145325e-06, "logits/chosen": -1.290929913520813, "logits/rejected": -1.290929913520813, "logps/chosen": -89.74241638183594, "logps/rejected": -89.74241638183594, "loss": 2.2138, "rewards/accuracies": 0.0, "rewards/chosen": 2.76165771484375, "rewards/margins": 0.0, "rewards/rejected": 2.76165771484375, "step": 1106 }, { "epoch": 0.25, "learning_rate": 9.777687895498128e-06, "logits/chosen": -0.8954533934593201, "logits/rejected": -0.8890600800514221, "logps/chosen": -149.3596954345703, "logps/rejected": -34.06968688964844, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 8.793599128723145, "rewards/margins": 5.655697822570801, "rewards/rejected": 3.1379013061523438, "step": 1107 }, { "epoch": 0.25, "learning_rate": 9.777159085929524e-06, "logits/chosen": -1.2689400911331177, "logits/rejected": -1.2689400911331177, "logps/chosen": -21.783227920532227, "logps/rejected": -21.783227920532227, "loss": 0.388, "rewards/accuracies": 0.0, "rewards/chosen": 0.8837289810180664, "rewards/margins": 0.0, "rewards/rejected": 0.8837289810180664, "step": 1108 }, { "epoch": 0.25, "learning_rate": 9.776629662507458e-06, "logits/chosen": -1.5038141012191772, "logits/rejected": -1.3762381076812744, "logps/chosen": -158.91851806640625, "logps/rejected": -36.97753143310547, "loss": 0.2393, "rewards/accuracies": 1.0, "rewards/chosen": 5.495553493499756, "rewards/margins": 5.317622661590576, "rewards/rejected": 0.1779308319091797, "step": 1109 }, { "epoch": 0.25, "learning_rate": 9.776099625299966e-06, "logits/chosen": -1.133646011352539, "logits/rejected": -1.128467321395874, "logps/chosen": -80.00018310546875, "logps/rejected": -109.57192993164062, "loss": 2.2794, "rewards/accuracies": 0.0, "rewards/chosen": 1.9892151355743408, "rewards/margins": -1.747453212738037, "rewards/rejected": 3.736668348312378, "step": 1110 }, { "epoch": 0.25, "learning_rate": 9.775568974375151e-06, "logits/chosen": -1.2607979774475098, "logits/rejected": -1.285003900527954, "logps/chosen": -51.68574523925781, "logps/rejected": -77.25477600097656, "loss": 0.2498, "rewards/accuracies": 1.0, "rewards/chosen": 3.7528610229492188, "rewards/margins": 0.4461555480957031, "rewards/rejected": 3.3067054748535156, "step": 1111 }, { "epoch": 0.25, "learning_rate": 9.775037709801206e-06, "logits/chosen": -0.9983938932418823, "logits/rejected": -0.9983938932418823, "logps/chosen": -16.06485366821289, "logps/rejected": -16.06485366821289, "loss": 0.4044, "rewards/accuracies": 0.0, "rewards/chosen": 1.9585827589035034, "rewards/margins": 0.0, "rewards/rejected": 1.9585827589035034, "step": 1112 }, { "epoch": 0.25, "learning_rate": 9.774505831646392e-06, "logits/chosen": -0.814864993095398, "logits/rejected": -0.814864993095398, "logps/chosen": -50.5178108215332, "logps/rejected": -50.5178108215332, "loss": 0.3474, "rewards/accuracies": 0.0, "rewards/chosen": 2.2306485176086426, "rewards/margins": 0.0, "rewards/rejected": 2.2306485176086426, "step": 1113 }, { "epoch": 0.25, "learning_rate": 9.773973339979056e-06, "logits/chosen": -1.1444405317306519, "logits/rejected": -0.9972385764122009, "logps/chosen": -59.026771545410156, "logps/rejected": -12.219293594360352, "loss": 0.0744, "rewards/accuracies": 1.0, "rewards/chosen": 3.0398507118225098, "rewards/margins": 1.9048346281051636, "rewards/rejected": 1.1350160837173462, "step": 1114 }, { "epoch": 0.25, "learning_rate": 9.773440234867623e-06, "logits/chosen": -1.284166693687439, "logits/rejected": -1.2409095764160156, "logps/chosen": -79.0589828491211, "logps/rejected": -51.593528747558594, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 5.759525299072266, "rewards/margins": 1.944671630859375, "rewards/rejected": 3.8148536682128906, "step": 1115 }, { "epoch": 0.25, "learning_rate": 9.772906516380594e-06, "logits/chosen": -1.1623400449752808, "logits/rejected": -1.0996134281158447, "logps/chosen": -50.32350158691406, "logps/rejected": -52.129615783691406, "loss": 0.2748, "rewards/accuracies": 1.0, "rewards/chosen": 1.8088852167129517, "rewards/margins": 0.33462071418762207, "rewards/rejected": 1.4742645025253296, "step": 1116 }, { "epoch": 0.25, "learning_rate": 9.772372184586551e-06, "logits/chosen": -1.7130602598190308, "logits/rejected": -1.6611227989196777, "logps/chosen": -99.41236877441406, "logps/rejected": -30.915462493896484, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 7.583537578582764, "rewards/margins": 4.664992332458496, "rewards/rejected": 2.9185452461242676, "step": 1117 }, { "epoch": 0.25, "learning_rate": 9.771837239554156e-06, "logits/chosen": -1.269883394241333, "logits/rejected": -1.0910152196884155, "logps/chosen": -42.28430938720703, "logps/rejected": -45.96330642700195, "loss": 0.2353, "rewards/accuracies": 1.0, "rewards/chosen": 2.164134979248047, "rewards/margins": 1.6024284362792969, "rewards/rejected": 0.56170654296875, "step": 1118 }, { "epoch": 0.25, "learning_rate": 9.771301681352148e-06, "logits/chosen": -1.7093254327774048, "logits/rejected": -1.686156988143921, "logps/chosen": -73.57479858398438, "logps/rejected": -107.07476806640625, "loss": 0.9719, "rewards/accuracies": 0.0, "rewards/chosen": 3.8228867053985596, "rewards/margins": -1.7791216373443604, "rewards/rejected": 5.60200834274292, "step": 1119 }, { "epoch": 0.25, "learning_rate": 9.770765510049342e-06, "logits/chosen": -1.71126389503479, "logits/rejected": -1.553837776184082, "logps/chosen": -120.457275390625, "logps/rejected": -71.32292175292969, "loss": 0.1819, "rewards/accuracies": 1.0, "rewards/chosen": 6.528149604797363, "rewards/margins": 2.6682183742523193, "rewards/rejected": 3.859931230545044, "step": 1120 }, { "epoch": 0.25, "learning_rate": 9.770228725714637e-06, "logits/chosen": -1.478100061416626, "logits/rejected": -1.511296033859253, "logps/chosen": -67.96868896484375, "logps/rejected": -85.94938659667969, "loss": 0.4175, "rewards/accuracies": 0.0, "rewards/chosen": 5.022117614746094, "rewards/margins": -0.231414794921875, "rewards/rejected": 5.253532409667969, "step": 1121 }, { "epoch": 0.25, "learning_rate": 9.769691328417008e-06, "logits/chosen": -0.8956376910209656, "logits/rejected": -0.7887579202651978, "logps/chosen": -77.81954956054688, "logps/rejected": -99.93574523925781, "loss": 0.8767, "rewards/accuracies": 0.0, "rewards/chosen": 5.531500339508057, "rewards/margins": -0.5156707763671875, "rewards/rejected": 6.047171115875244, "step": 1122 }, { "epoch": 0.25, "learning_rate": 9.769153318225509e-06, "logits/chosen": -1.5467404127120972, "logits/rejected": -1.4860059022903442, "logps/chosen": -139.0313720703125, "logps/rejected": -40.99634552001953, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": 5.8018479347229, "rewards/margins": 2.7544777393341064, "rewards/rejected": 3.047370195388794, "step": 1123 }, { "epoch": 0.25, "learning_rate": 9.768614695209273e-06, "logits/chosen": -1.0302071571350098, "logits/rejected": -1.003653883934021, "logps/chosen": -43.315067291259766, "logps/rejected": -98.32304382324219, "loss": 0.5809, "rewards/accuracies": 0.0, "rewards/chosen": 3.406776189804077, "rewards/margins": -0.7805392742156982, "rewards/rejected": 4.187315464019775, "step": 1124 }, { "epoch": 0.25, "learning_rate": 9.768075459437513e-06, "logits/chosen": -1.236352801322937, "logits/rejected": -1.2547119855880737, "logps/chosen": -75.75463104248047, "logps/rejected": -85.95146942138672, "loss": 2.4788, "rewards/accuracies": 0.0, "rewards/chosen": 1.3435112237930298, "rewards/margins": -4.778938293457031, "rewards/rejected": 6.1224493980407715, "step": 1125 }, { "epoch": 0.25, "learning_rate": 9.76753561097952e-06, "logits/chosen": -1.2951055765151978, "logits/rejected": -1.1901845932006836, "logps/chosen": -87.03094482421875, "logps/rejected": -36.86115264892578, "loss": 0.5441, "rewards/accuracies": 0.0, "rewards/chosen": 2.215888261795044, "rewards/margins": -0.6500060558319092, "rewards/rejected": 2.865894317626953, "step": 1126 }, { "epoch": 0.25, "learning_rate": 9.766995149904658e-06, "logits/chosen": -1.2381564378738403, "logits/rejected": -1.2726409435272217, "logps/chosen": -98.97317504882812, "logps/rejected": -133.29635620117188, "loss": 0.4431, "rewards/accuracies": 0.0, "rewards/chosen": 7.426977634429932, "rewards/margins": -0.3411531448364258, "rewards/rejected": 7.768130779266357, "step": 1127 }, { "epoch": 0.25, "learning_rate": 9.766454076282382e-06, "logits/chosen": -1.2632840871810913, "logits/rejected": -0.8724533915519714, "logps/chosen": -45.32575988769531, "logps/rejected": -77.98921203613281, "loss": 1.3432, "rewards/accuracies": 0.0, "rewards/chosen": 1.9597564935684204, "rewards/margins": -1.644147515296936, "rewards/rejected": 3.6039040088653564, "step": 1128 }, { "epoch": 0.25, "learning_rate": 9.765912390182216e-06, "logits/chosen": -1.5024136304855347, "logits/rejected": -1.5227210521697998, "logps/chosen": -94.28060913085938, "logps/rejected": -111.95967864990234, "loss": 0.4397, "rewards/accuracies": 0.0, "rewards/chosen": 5.450595378875732, "rewards/margins": -0.22210454940795898, "rewards/rejected": 5.672699928283691, "step": 1129 }, { "epoch": 0.25, "learning_rate": 9.765370091673762e-06, "logits/chosen": -1.290155053138733, "logits/rejected": -1.2636511325836182, "logps/chosen": -22.930843353271484, "logps/rejected": -41.02491760253906, "loss": 2.1893, "rewards/accuracies": 0.0, "rewards/chosen": 1.674631118774414, "rewards/margins": -1.3823513984680176, "rewards/rejected": 3.0569825172424316, "step": 1130 }, { "epoch": 0.25, "learning_rate": 9.764827180826708e-06, "logits/chosen": -1.6105035543441772, "logits/rejected": -1.7040351629257202, "logps/chosen": -115.62732696533203, "logps/rejected": -209.13595581054688, "loss": 2.7759, "rewards/accuracies": 0.0, "rewards/chosen": 6.162600040435791, "rewards/margins": -5.41448450088501, "rewards/rejected": 11.5770845413208, "step": 1131 }, { "epoch": 0.25, "learning_rate": 9.764283657710815e-06, "logits/chosen": -1.3683680295944214, "logits/rejected": -1.2890000343322754, "logps/chosen": -56.296836853027344, "logps/rejected": -33.63031005859375, "loss": 0.8952, "rewards/accuracies": 0.0, "rewards/chosen": 2.9998955726623535, "rewards/margins": -0.12068867683410645, "rewards/rejected": 3.12058424949646, "step": 1132 }, { "epoch": 0.25, "learning_rate": 9.763739522395926e-06, "logits/chosen": -1.2815122604370117, "logits/rejected": -1.366597294807434, "logps/chosen": -105.0202407836914, "logps/rejected": -102.71942138671875, "loss": 2.0462, "rewards/accuracies": 0.0, "rewards/chosen": 4.592623233795166, "rewards/margins": -3.573273181915283, "rewards/rejected": 8.16589641571045, "step": 1133 }, { "epoch": 0.25, "learning_rate": 9.76319477495196e-06, "logits/chosen": -1.1714988946914673, "logits/rejected": -1.1520963907241821, "logps/chosen": -53.256797790527344, "logps/rejected": -35.89639663696289, "loss": 0.9669, "rewards/accuracies": 0.0, "rewards/chosen": 1.032507300376892, "rewards/margins": -1.7235699892044067, "rewards/rejected": 2.756077289581299, "step": 1134 }, { "epoch": 0.25, "learning_rate": 9.762649415448916e-06, "logits/chosen": -1.3071587085723877, "logits/rejected": -1.2458394765853882, "logps/chosen": -100.42426300048828, "logps/rejected": -50.06529235839844, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": 6.923290252685547, "rewards/margins": 1.9327969551086426, "rewards/rejected": 4.990493297576904, "step": 1135 }, { "epoch": 0.25, "learning_rate": 9.76210344395687e-06, "logits/chosen": -1.4985452890396118, "logits/rejected": -1.5216118097305298, "logps/chosen": -53.54167938232422, "logps/rejected": -36.04045867919922, "loss": 1.3812, "rewards/accuracies": 0.0, "rewards/chosen": 0.8266006708145142, "rewards/margins": -1.216338038444519, "rewards/rejected": 2.042938709259033, "step": 1136 }, { "epoch": 0.25, "learning_rate": 9.76155686054598e-06, "logits/chosen": -1.3972713947296143, "logits/rejected": -1.225796103477478, "logps/chosen": -85.89454650878906, "logps/rejected": -85.05606079101562, "loss": 0.7011, "rewards/accuracies": 0.0, "rewards/chosen": 5.425651550292969, "rewards/margins": -0.19978046417236328, "rewards/rejected": 5.625432014465332, "step": 1137 }, { "epoch": 0.25, "learning_rate": 9.76100966528648e-06, "logits/chosen": -1.5238008499145508, "logits/rejected": -1.4466074705123901, "logps/chosen": -36.57994079589844, "logps/rejected": -49.94916534423828, "loss": 1.5292, "rewards/accuracies": 0.0, "rewards/chosen": 2.125354528427124, "rewards/margins": -2.2666261196136475, "rewards/rejected": 4.3919806480407715, "step": 1138 }, { "epoch": 0.25, "learning_rate": 9.760461858248684e-06, "logits/chosen": -1.4857491254806519, "logits/rejected": -1.4408217668533325, "logps/chosen": -60.83626174926758, "logps/rejected": -94.19657135009766, "loss": 0.76, "rewards/accuracies": 1.0, "rewards/chosen": 7.975335597991943, "rewards/margins": 4.276602745056152, "rewards/rejected": 3.698732852935791, "step": 1139 }, { "epoch": 0.25, "learning_rate": 9.759913439502982e-06, "logits/chosen": -1.4306472539901733, "logits/rejected": -1.3488678932189941, "logps/chosen": -53.7878532409668, "logps/rejected": -33.8629150390625, "loss": 2.3437, "rewards/accuracies": 0.0, "rewards/chosen": 1.6907833814620972, "rewards/margins": -0.5200518369674683, "rewards/rejected": 2.2108352184295654, "step": 1140 }, { "epoch": 0.25, "learning_rate": 9.759364409119844e-06, "logits/chosen": -0.9805811047554016, "logits/rejected": -0.9805811047554016, "logps/chosen": -50.27971649169922, "logps/rejected": -50.27971649169922, "loss": 1.6079, "rewards/accuracies": 0.0, "rewards/chosen": 3.00300669670105, "rewards/margins": 0.0, "rewards/rejected": 3.00300669670105, "step": 1141 }, { "epoch": 0.25, "learning_rate": 9.758814767169825e-06, "logits/chosen": -1.7679163217544556, "logits/rejected": -1.7289832830429077, "logps/chosen": -66.0338134765625, "logps/rejected": -93.77471923828125, "loss": 2.0016, "rewards/accuracies": 0.0, "rewards/chosen": 2.705766439437866, "rewards/margins": -2.428016424179077, "rewards/rejected": 5.133782863616943, "step": 1142 }, { "epoch": 0.25, "learning_rate": 9.758264513723544e-06, "logits/chosen": -1.3979607820510864, "logits/rejected": -1.4946709871292114, "logps/chosen": -98.03369140625, "logps/rejected": -137.6811065673828, "loss": 4.3215, "rewards/accuracies": 0.0, "rewards/chosen": 3.598095655441284, "rewards/margins": -7.606808662414551, "rewards/rejected": 11.204904556274414, "step": 1143 }, { "epoch": 0.25, "learning_rate": 9.757713648851714e-06, "logits/chosen": -1.676811933517456, "logits/rejected": -1.6769118309020996, "logps/chosen": -110.7713394165039, "logps/rejected": -111.04405975341797, "loss": 2.5315, "rewards/accuracies": 0.0, "rewards/chosen": 6.816797733306885, "rewards/margins": -4.855647563934326, "rewards/rejected": 11.672445297241211, "step": 1144 }, { "epoch": 0.25, "learning_rate": 9.757162172625116e-06, "logits/chosen": -1.5604908466339111, "logits/rejected": -1.4469795227050781, "logps/chosen": -49.76240539550781, "logps/rejected": -48.41197204589844, "loss": 0.3284, "rewards/accuracies": 1.0, "rewards/chosen": 2.9948418140411377, "rewards/margins": 0.13154840469360352, "rewards/rejected": 2.863293409347534, "step": 1145 }, { "epoch": 0.25, "learning_rate": 9.756610085114615e-06, "logits/chosen": -1.117929458618164, "logits/rejected": -1.0071449279785156, "logps/chosen": -107.02091979980469, "logps/rejected": -59.37383270263672, "loss": 1.1594, "rewards/accuracies": 0.0, "rewards/chosen": 3.138810873031616, "rewards/margins": -2.210101842880249, "rewards/rejected": 5.348912715911865, "step": 1146 }, { "epoch": 0.25, "learning_rate": 9.756057386391154e-06, "logits/chosen": -1.7501693964004517, "logits/rejected": -1.652007818222046, "logps/chosen": -85.05867004394531, "logps/rejected": -35.81269073486328, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 7.293150424957275, "rewards/margins": 5.757490158081055, "rewards/rejected": 1.5356601476669312, "step": 1147 }, { "epoch": 0.25, "learning_rate": 9.75550407652575e-06, "logits/chosen": -1.1491219997406006, "logits/rejected": -1.1491219997406006, "logps/chosen": -74.49574279785156, "logps/rejected": -74.49574279785156, "loss": 0.5127, "rewards/accuracies": 0.0, "rewards/chosen": 3.652308702468872, "rewards/margins": 0.0, "rewards/rejected": 3.652308702468872, "step": 1148 }, { "epoch": 0.25, "learning_rate": 9.754950155589504e-06, "logits/chosen": -1.5075411796569824, "logits/rejected": -1.3725242614746094, "logps/chosen": -115.92449951171875, "logps/rejected": -86.47493743896484, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 5.447383403778076, "rewards/margins": 2.3167803287506104, "rewards/rejected": 3.130603075027466, "step": 1149 }, { "epoch": 0.25, "learning_rate": 9.754395623653595e-06, "logits/chosen": -1.229603886604309, "logits/rejected": -1.0839804410934448, "logps/chosen": -39.00638198852539, "logps/rejected": -7.836466312408447, "loss": 1.5222, "rewards/accuracies": 1.0, "rewards/chosen": 1.6673790216445923, "rewards/margins": 0.8787249326705933, "rewards/rejected": 0.788654088973999, "step": 1150 }, { "epoch": 0.25, "learning_rate": 9.753840480789278e-06, "logits/chosen": -1.208418369293213, "logits/rejected": -1.10894775390625, "logps/chosen": -66.58245849609375, "logps/rejected": -48.996437072753906, "loss": 2.1953, "rewards/accuracies": 1.0, "rewards/chosen": 2.941610097885132, "rewards/margins": 0.35900425910949707, "rewards/rejected": 2.5826058387756348, "step": 1151 }, { "epoch": 0.25, "learning_rate": 9.753284727067886e-06, "logits/chosen": -1.183435320854187, "logits/rejected": -1.244400978088379, "logps/chosen": -23.001178741455078, "logps/rejected": -37.2421875, "loss": 1.2291, "rewards/accuracies": 0.0, "rewards/chosen": 1.7997245788574219, "rewards/margins": -1.7642464637756348, "rewards/rejected": 3.5639710426330566, "step": 1152 }, { "epoch": 0.26, "learning_rate": 9.752728362560834e-06, "logits/chosen": -1.3934130668640137, "logits/rejected": -1.299688458442688, "logps/chosen": -176.55215454101562, "logps/rejected": -131.9561767578125, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 6.938653469085693, "rewards/margins": 0.9411649703979492, "rewards/rejected": 5.997488498687744, "step": 1153 }, { "epoch": 0.26, "learning_rate": 9.752171387339612e-06, "logits/chosen": -1.173327088356018, "logits/rejected": -1.173327088356018, "logps/chosen": -56.95843505859375, "logps/rejected": -56.95843505859375, "loss": 0.4764, "rewards/accuracies": 0.0, "rewards/chosen": 2.777087450027466, "rewards/margins": 0.0, "rewards/rejected": 2.777087450027466, "step": 1154 }, { "epoch": 0.26, "learning_rate": 9.75161380147579e-06, "logits/chosen": -1.2166602611541748, "logits/rejected": -1.217665433883667, "logps/chosen": -70.95832824707031, "logps/rejected": -54.800418853759766, "loss": 2.012, "rewards/accuracies": 0.0, "rewards/chosen": 1.7598190307617188, "rewards/margins": -3.9636454582214355, "rewards/rejected": 5.723464488983154, "step": 1155 }, { "epoch": 0.26, "learning_rate": 9.751055605041017e-06, "logits/chosen": -1.2629642486572266, "logits/rejected": -1.3242785930633545, "logps/chosen": -72.15376281738281, "logps/rejected": -114.35457611083984, "loss": 1.9341, "rewards/accuracies": 0.0, "rewards/chosen": 2.1133697032928467, "rewards/margins": -3.3076059818267822, "rewards/rejected": 5.420975685119629, "step": 1156 }, { "epoch": 0.26, "learning_rate": 9.750496798107021e-06, "logits/chosen": -1.3023251295089722, "logits/rejected": -1.3023251295089722, "logps/chosen": -91.17218017578125, "logps/rejected": -91.17218017578125, "loss": 0.3663, "rewards/accuracies": 0.0, "rewards/chosen": 4.415713787078857, "rewards/margins": 0.0, "rewards/rejected": 4.415713787078857, "step": 1157 }, { "epoch": 0.26, "learning_rate": 9.749937380745607e-06, "logits/chosen": -1.6931391954421997, "logits/rejected": -1.608811855316162, "logps/chosen": -209.10153198242188, "logps/rejected": -196.9962158203125, "loss": 2.7905, "rewards/accuracies": 0.0, "rewards/chosen": 4.105472087860107, "rewards/margins": -4.880324840545654, "rewards/rejected": 8.985796928405762, "step": 1158 }, { "epoch": 0.26, "learning_rate": 9.749377353028657e-06, "logits/chosen": -1.6241589784622192, "logits/rejected": -1.9928946495056152, "logps/chosen": -61.46515655517578, "logps/rejected": -90.71885681152344, "loss": 0.4332, "rewards/accuracies": 1.0, "rewards/chosen": 3.736325979232788, "rewards/margins": 2.5623321533203125, "rewards/rejected": 1.173993706703186, "step": 1159 }, { "epoch": 0.26, "learning_rate": 9.748816715028135e-06, "logits/chosen": -1.4122142791748047, "logits/rejected": -1.3249834775924683, "logps/chosen": -57.57655715942383, "logps/rejected": -67.88985443115234, "loss": 0.291, "rewards/accuracies": 1.0, "rewards/chosen": 0.9911556243896484, "rewards/margins": 0.24162864685058594, "rewards/rejected": 0.7495269775390625, "step": 1160 }, { "epoch": 0.26, "learning_rate": 9.748255466816081e-06, "logits/chosen": -1.2188142538070679, "logits/rejected": -0.9986188411712646, "logps/chosen": -174.16323852539062, "logps/rejected": -51.52317428588867, "loss": 0.2701, "rewards/accuracies": 1.0, "rewards/chosen": 5.654077053070068, "rewards/margins": 1.7387545108795166, "rewards/rejected": 3.9153225421905518, "step": 1161 }, { "epoch": 0.26, "learning_rate": 9.747693608464614e-06, "logits/chosen": -1.648816704750061, "logits/rejected": -1.5532883405685425, "logps/chosen": -114.29885864257812, "logps/rejected": -131.77426147460938, "loss": 2.1147, "rewards/accuracies": 0.0, "rewards/chosen": 5.192513942718506, "rewards/margins": -4.1146368980407715, "rewards/rejected": 9.307150840759277, "step": 1162 }, { "epoch": 0.26, "learning_rate": 9.74713114004593e-06, "logits/chosen": -1.416283130645752, "logits/rejected": -1.362362265586853, "logps/chosen": -41.18179702758789, "logps/rejected": -48.211082458496094, "loss": 0.3162, "rewards/accuracies": 1.0, "rewards/chosen": 2.9843595027923584, "rewards/margins": 0.13982510566711426, "rewards/rejected": 2.844534397125244, "step": 1163 }, { "epoch": 0.26, "learning_rate": 9.746568061632308e-06, "logits/chosen": -1.3474054336547852, "logits/rejected": -1.1284339427947998, "logps/chosen": -165.03314208984375, "logps/rejected": -42.292259216308594, "loss": 0.7709, "rewards/accuracies": 0.0, "rewards/chosen": 4.348791599273682, "rewards/margins": -0.3193178176879883, "rewards/rejected": 4.66810941696167, "step": 1164 }, { "epoch": 0.26, "learning_rate": 9.746004373296099e-06, "logits/chosen": -1.2282159328460693, "logits/rejected": -1.1846768856048584, "logps/chosen": -65.45777893066406, "logps/rejected": -55.99085998535156, "loss": 1.227, "rewards/accuracies": 1.0, "rewards/chosen": 2.7783889770507812, "rewards/margins": 0.21423649787902832, "rewards/rejected": 2.564152479171753, "step": 1165 }, { "epoch": 0.26, "learning_rate": 9.745440075109738e-06, "logits/chosen": -1.3200349807739258, "logits/rejected": -1.312554955482483, "logps/chosen": -52.64030456542969, "logps/rejected": -94.23048400878906, "loss": 1.3233, "rewards/accuracies": 0.0, "rewards/chosen": 1.2299911975860596, "rewards/margins": -2.5711395740509033, "rewards/rejected": 3.801130771636963, "step": 1166 }, { "epoch": 0.26, "learning_rate": 9.744875167145735e-06, "logits/chosen": -1.4331964254379272, "logits/rejected": -1.2421716451644897, "logps/chosen": -75.54939270019531, "logps/rejected": -65.662109375, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": 5.24526834487915, "rewards/margins": 3.6200919151306152, "rewards/rejected": 1.6251763105392456, "step": 1167 }, { "epoch": 0.26, "learning_rate": 9.74430964947668e-06, "logits/chosen": -1.0845916271209717, "logits/rejected": -1.0042650699615479, "logps/chosen": -80.52828979492188, "logps/rejected": -71.21542358398438, "loss": 0.5799, "rewards/accuracies": 0.0, "rewards/chosen": 1.2898834943771362, "rewards/margins": -0.6128150224685669, "rewards/rejected": 1.9026985168457031, "step": 1168 }, { "epoch": 0.26, "learning_rate": 9.74374352217524e-06, "logits/chosen": -1.5676404237747192, "logits/rejected": -1.4929612874984741, "logps/chosen": -48.77665710449219, "logps/rejected": -50.802406311035156, "loss": 0.3377, "rewards/accuracies": 1.0, "rewards/chosen": 3.735095977783203, "rewards/margins": 0.29247593879699707, "rewards/rejected": 3.442620038986206, "step": 1169 }, { "epoch": 0.26, "learning_rate": 9.743176785314159e-06, "logits/chosen": -0.914474368095398, "logits/rejected": -0.7380295991897583, "logps/chosen": -68.42416381835938, "logps/rejected": -27.807498931884766, "loss": 0.13, "rewards/accuracies": 1.0, "rewards/chosen": 1.538427710533142, "rewards/margins": 1.4282550811767578, "rewards/rejected": 0.11017265170812607, "step": 1170 }, { "epoch": 0.26, "learning_rate": 9.742609438966265e-06, "logits/chosen": -1.28695809841156, "logits/rejected": -1.28695809841156, "logps/chosen": -37.44500732421875, "logps/rejected": -37.44500732421875, "loss": 0.3578, "rewards/accuracies": 0.0, "rewards/chosen": 3.9853103160858154, "rewards/margins": 0.0, "rewards/rejected": 3.9853103160858154, "step": 1171 }, { "epoch": 0.26, "learning_rate": 9.74204148320446e-06, "logits/chosen": -1.4235211610794067, "logits/rejected": -1.3768261671066284, "logps/chosen": -83.3662338256836, "logps/rejected": -88.72976684570312, "loss": 0.3428, "rewards/accuracies": 1.0, "rewards/chosen": 4.167636871337891, "rewards/margins": 0.13566970825195312, "rewards/rejected": 4.0319671630859375, "step": 1172 }, { "epoch": 0.26, "learning_rate": 9.741472918101722e-06, "logits/chosen": -1.110672950744629, "logits/rejected": -0.9469647407531738, "logps/chosen": -69.05256652832031, "logps/rejected": -75.11428833007812, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": 3.5910308361053467, "rewards/margins": 0.5239362716674805, "rewards/rejected": 3.067094564437866, "step": 1173 }, { "epoch": 0.26, "learning_rate": 9.740903743731113e-06, "logits/chosen": -1.166370153427124, "logits/rejected": -1.2533187866210938, "logps/chosen": -29.158794403076172, "logps/rejected": -108.9847640991211, "loss": 1.329, "rewards/accuracies": 0.0, "rewards/chosen": 2.220708131790161, "rewards/margins": -1.560204029083252, "rewards/rejected": 3.780912160873413, "step": 1174 }, { "epoch": 0.26, "learning_rate": 9.74033396016577e-06, "logits/chosen": -1.465609073638916, "logits/rejected": -1.4002293348312378, "logps/chosen": -112.89090728759766, "logps/rejected": -90.1206283569336, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 8.069716453552246, "rewards/margins": 1.7352900505065918, "rewards/rejected": 6.334426403045654, "step": 1175 }, { "epoch": 0.26, "learning_rate": 9.739763567478908e-06, "logits/chosen": -1.356035590171814, "logits/rejected": -1.3140116930007935, "logps/chosen": -149.53762817382812, "logps/rejected": -107.2430419921875, "loss": 0.7051, "rewards/accuracies": 0.0, "rewards/chosen": 4.332553386688232, "rewards/margins": -1.1183805465698242, "rewards/rejected": 5.450933933258057, "step": 1176 }, { "epoch": 0.26, "learning_rate": 9.739192565743822e-06, "logits/chosen": -1.504873514175415, "logits/rejected": -1.446675181388855, "logps/chosen": -128.0897216796875, "logps/rejected": -82.53201293945312, "loss": 0.0875, "rewards/accuracies": 1.0, "rewards/chosen": 7.062512397766113, "rewards/margins": 1.6607513427734375, "rewards/rejected": 5.401761054992676, "step": 1177 }, { "epoch": 0.26, "learning_rate": 9.738620955033883e-06, "logits/chosen": -1.6145998239517212, "logits/rejected": -1.598875641822815, "logps/chosen": -96.32939910888672, "logps/rejected": -124.08699798583984, "loss": 0.9671, "rewards/accuracies": 0.0, "rewards/chosen": 2.6339166164398193, "rewards/margins": -1.3520889282226562, "rewards/rejected": 3.9860055446624756, "step": 1178 }, { "epoch": 0.26, "learning_rate": 9.738048735422545e-06, "logits/chosen": -1.5392441749572754, "logits/rejected": -1.4179357290267944, "logps/chosen": -125.81085205078125, "logps/rejected": -134.56263732910156, "loss": 0.7629, "rewards/accuracies": 1.0, "rewards/chosen": 7.583111763000488, "rewards/margins": 0.47844696044921875, "rewards/rejected": 7.1046648025512695, "step": 1179 }, { "epoch": 0.26, "learning_rate": 9.737475906983333e-06, "logits/chosen": -1.184558391571045, "logits/rejected": -1.127603530883789, "logps/chosen": -76.04866790771484, "logps/rejected": -57.14141845703125, "loss": 0.9274, "rewards/accuracies": 0.0, "rewards/chosen": 2.1254920959472656, "rewards/margins": -0.2835197448730469, "rewards/rejected": 2.4090118408203125, "step": 1180 }, { "epoch": 0.26, "learning_rate": 9.736902469789855e-06, "logits/chosen": -1.1785374879837036, "logits/rejected": -1.2243969440460205, "logps/chosen": -60.229034423828125, "logps/rejected": -112.51266479492188, "loss": 1.863, "rewards/accuracies": 0.0, "rewards/chosen": 2.294811964035034, "rewards/margins": -3.52018141746521, "rewards/rejected": 5.814993381500244, "step": 1181 }, { "epoch": 0.26, "learning_rate": 9.736328423915797e-06, "logits/chosen": -1.0177010297775269, "logits/rejected": -1.0101038217544556, "logps/chosen": -56.32085037231445, "logps/rejected": -29.201576232910156, "loss": 0.6512, "rewards/accuracies": 0.0, "rewards/chosen": 2.00718355178833, "rewards/margins": -0.6988575458526611, "rewards/rejected": 2.706041097640991, "step": 1182 }, { "epoch": 0.26, "learning_rate": 9.735753769434923e-06, "logits/chosen": -1.4008433818817139, "logits/rejected": -1.333071231842041, "logps/chosen": -119.75323486328125, "logps/rejected": -100.4727783203125, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": 6.147584438323975, "rewards/margins": 0.8180127143859863, "rewards/rejected": 5.329571723937988, "step": 1183 }, { "epoch": 0.26, "learning_rate": 9.735178506421075e-06, "logits/chosen": -1.2871637344360352, "logits/rejected": -1.1141160726547241, "logps/chosen": -82.29135131835938, "logps/rejected": -44.951595306396484, "loss": 0.1893, "rewards/accuracies": 1.0, "rewards/chosen": 4.894253730773926, "rewards/margins": 3.6698625087738037, "rewards/rejected": 1.224391222000122, "step": 1184 }, { "epoch": 0.26, "learning_rate": 9.73460263494817e-06, "logits/chosen": -1.5845946073532104, "logits/rejected": -1.5370709896087646, "logps/chosen": -61.1117057800293, "logps/rejected": -74.99966430664062, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": 3.8680484294891357, "rewards/margins": 1.8122050762176514, "rewards/rejected": 2.0558433532714844, "step": 1185 }, { "epoch": 0.26, "learning_rate": 9.734026155090208e-06, "logits/chosen": -1.2664862871170044, "logits/rejected": -1.2169002294540405, "logps/chosen": -85.27903747558594, "logps/rejected": -101.87190246582031, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 7.1905107498168945, "rewards/margins": 4.632246971130371, "rewards/rejected": 2.5582635402679443, "step": 1186 }, { "epoch": 0.26, "learning_rate": 9.733449066921268e-06, "logits/chosen": -0.711774468421936, "logits/rejected": -0.711774468421936, "logps/chosen": -6.772879123687744, "logps/rejected": -6.772879123687744, "loss": 1.0599, "rewards/accuracies": 0.0, "rewards/chosen": 0.3223028779029846, "rewards/margins": 0.0, "rewards/rejected": 0.3223028779029846, "step": 1187 }, { "epoch": 0.26, "learning_rate": 9.7328713705155e-06, "logits/chosen": -1.3964123725891113, "logits/rejected": -1.2240424156188965, "logps/chosen": -46.77799606323242, "logps/rejected": -24.845611572265625, "loss": 0.1457, "rewards/accuracies": 1.0, "rewards/chosen": 1.8848956823349, "rewards/margins": 1.5794682502746582, "rewards/rejected": 0.3054273724555969, "step": 1188 }, { "epoch": 0.26, "learning_rate": 9.732293065947138e-06, "logits/chosen": -1.1079113483428955, "logits/rejected": -0.9923588633537292, "logps/chosen": -37.82756805419922, "logps/rejected": -29.18418312072754, "loss": 0.3286, "rewards/accuracies": 1.0, "rewards/chosen": 2.2121188640594482, "rewards/margins": 0.4880948066711426, "rewards/rejected": 1.7240240573883057, "step": 1189 }, { "epoch": 0.26, "learning_rate": 9.731714153290492e-06, "logits/chosen": -1.750024676322937, "logits/rejected": -1.5296554565429688, "logps/chosen": -89.08878326416016, "logps/rejected": -69.33038330078125, "loss": 0.2045, "rewards/accuracies": 1.0, "rewards/chosen": 7.054229259490967, "rewards/margins": 3.9361581802368164, "rewards/rejected": 3.1180710792541504, "step": 1190 }, { "epoch": 0.26, "learning_rate": 9.731134632619954e-06, "logits/chosen": -1.5336722135543823, "logits/rejected": -1.4692788124084473, "logps/chosen": -61.52031707763672, "logps/rejected": -63.18076705932617, "loss": 1.7344, "rewards/accuracies": 0.0, "rewards/chosen": 2.6165223121643066, "rewards/margins": -0.2526552677154541, "rewards/rejected": 2.8691775798797607, "step": 1191 }, { "epoch": 0.26, "learning_rate": 9.73055450400999e-06, "logits/chosen": -1.1602635383605957, "logits/rejected": -1.0920701026916504, "logps/chosen": -52.86827087402344, "logps/rejected": -53.415618896484375, "loss": 0.3606, "rewards/accuracies": 1.0, "rewards/chosen": 1.098244547843933, "rewards/margins": 0.9989349842071533, "rewards/rejected": 0.099309541285038, "step": 1192 }, { "epoch": 0.26, "learning_rate": 9.729973767535142e-06, "logits/chosen": -1.354645848274231, "logits/rejected": -1.1942076683044434, "logps/chosen": -103.53547668457031, "logps/rejected": -64.84837341308594, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 8.39506721496582, "rewards/margins": 4.005756855010986, "rewards/rejected": 4.389310359954834, "step": 1193 }, { "epoch": 0.26, "learning_rate": 9.729392423270036e-06, "logits/chosen": -1.4209694862365723, "logits/rejected": -1.3452218770980835, "logps/chosen": -67.50689697265625, "logps/rejected": -45.525917053222656, "loss": 1.0345, "rewards/accuracies": 0.0, "rewards/chosen": 2.214590549468994, "rewards/margins": -1.7223913669586182, "rewards/rejected": 3.9369819164276123, "step": 1194 }, { "epoch": 0.26, "learning_rate": 9.728810471289374e-06, "logits/chosen": -1.1793794631958008, "logits/rejected": -1.1261595487594604, "logps/chosen": -86.0357894897461, "logps/rejected": -94.56233978271484, "loss": 2.3424, "rewards/accuracies": 0.0, "rewards/chosen": 3.557910203933716, "rewards/margins": -4.185797691345215, "rewards/rejected": 7.74370813369751, "step": 1195 }, { "epoch": 0.26, "learning_rate": 9.728227911667934e-06, "logits/chosen": -1.0692362785339355, "logits/rejected": -1.0949745178222656, "logps/chosen": -58.01915740966797, "logps/rejected": -73.45060729980469, "loss": 1.1908, "rewards/accuracies": 0.0, "rewards/chosen": 2.0246071815490723, "rewards/margins": -1.7245681285858154, "rewards/rejected": 3.7491753101348877, "step": 1196 }, { "epoch": 0.26, "learning_rate": 9.727644744480571e-06, "logits/chosen": -1.4558024406433105, "logits/rejected": -1.3870763778686523, "logps/chosen": -139.5001220703125, "logps/rejected": -91.68717956542969, "loss": 0.4799, "rewards/accuracies": 0.0, "rewards/chosen": 4.729840278625488, "rewards/margins": -0.27505922317504883, "rewards/rejected": 5.004899501800537, "step": 1197 }, { "epoch": 0.27, "learning_rate": 9.727060969802226e-06, "logits/chosen": -1.2033214569091797, "logits/rejected": -1.114527940750122, "logps/chosen": -90.66094207763672, "logps/rejected": -39.716331481933594, "loss": 0.2485, "rewards/accuracies": 1.0, "rewards/chosen": 3.826988935470581, "rewards/margins": 3.082378387451172, "rewards/rejected": 0.744610607624054, "step": 1198 }, { "epoch": 0.27, "learning_rate": 9.726476587707908e-06, "logits/chosen": -1.4742604494094849, "logits/rejected": -1.4294902086257935, "logps/chosen": -57.43560028076172, "logps/rejected": -48.47393035888672, "loss": 0.5559, "rewards/accuracies": 0.0, "rewards/chosen": 2.694326877593994, "rewards/margins": -0.48163676261901855, "rewards/rejected": 3.1759636402130127, "step": 1199 }, { "epoch": 0.27, "learning_rate": 9.725891598272711e-06, "logits/chosen": -1.2146199941635132, "logits/rejected": -1.2232178449630737, "logps/chosen": -23.736604690551758, "logps/rejected": -89.34075927734375, "loss": 2.9703, "rewards/accuracies": 0.0, "rewards/chosen": 0.40386486053466797, "rewards/margins": -5.305459976196289, "rewards/rejected": 5.709324836730957, "step": 1200 }, { "epoch": 0.27, "learning_rate": 9.725306001571806e-06, "logits/chosen": -1.339630365371704, "logits/rejected": -1.2263139486312866, "logps/chosen": -46.00883483886719, "logps/rejected": -48.87077713012695, "loss": 1.4771, "rewards/accuracies": 1.0, "rewards/chosen": 3.680382490158081, "rewards/margins": 1.120285987854004, "rewards/rejected": 2.560096502304077, "step": 1201 }, { "epoch": 0.27, "learning_rate": 9.72471979768044e-06, "logits/chosen": -1.2688586711883545, "logits/rejected": -1.1248356103897095, "logps/chosen": -86.62728118896484, "logps/rejected": -68.8204345703125, "loss": 0.8299, "rewards/accuracies": 1.0, "rewards/chosen": 4.951179027557373, "rewards/margins": 0.8239588737487793, "rewards/rejected": 4.127220153808594, "step": 1202 }, { "epoch": 0.27, "learning_rate": 9.724132986673935e-06, "logits/chosen": -1.4389939308166504, "logits/rejected": -1.4430118799209595, "logps/chosen": -132.69252014160156, "logps/rejected": -79.38922119140625, "loss": 1.3776, "rewards/accuracies": 1.0, "rewards/chosen": 8.9754056930542, "rewards/margins": 2.0539183616638184, "rewards/rejected": 6.921487331390381, "step": 1203 }, { "epoch": 0.27, "learning_rate": 9.723545568627699e-06, "logits/chosen": -1.4808268547058105, "logits/rejected": -1.433545470237732, "logps/chosen": -57.437259674072266, "logps/rejected": -29.68946647644043, "loss": 0.2062, "rewards/accuracies": 1.0, "rewards/chosen": 3.0700619220733643, "rewards/margins": 0.9806210994720459, "rewards/rejected": 2.0894408226013184, "step": 1204 }, { "epoch": 0.27, "learning_rate": 9.722957543617211e-06, "logits/chosen": -1.3307337760925293, "logits/rejected": -1.3307337760925293, "logps/chosen": -27.665203094482422, "logps/rejected": -27.665203094482422, "loss": 0.3516, "rewards/accuracies": 0.0, "rewards/chosen": 2.7636234760284424, "rewards/margins": 0.0, "rewards/rejected": 2.7636234760284424, "step": 1205 }, { "epoch": 0.27, "learning_rate": 9.722368911718034e-06, "logits/chosen": -1.2842720746994019, "logits/rejected": -1.3063019514083862, "logps/chosen": -29.807483673095703, "logps/rejected": -84.62005615234375, "loss": 1.6855, "rewards/accuracies": 0.0, "rewards/chosen": 2.0636959075927734, "rewards/margins": -2.8727526664733887, "rewards/rejected": 4.936448574066162, "step": 1206 }, { "epoch": 0.27, "learning_rate": 9.721779673005805e-06, "logits/chosen": -1.5468647480010986, "logits/rejected": -1.3299366235733032, "logps/chosen": -114.9288101196289, "logps/rejected": -43.149658203125, "loss": 0.1432, "rewards/accuracies": 1.0, "rewards/chosen": 8.599806785583496, "rewards/margins": 6.369651794433594, "rewards/rejected": 2.2301552295684814, "step": 1207 }, { "epoch": 0.27, "learning_rate": 9.721189827556237e-06, "logits/chosen": -1.3535256385803223, "logits/rejected": -1.0164607763290405, "logps/chosen": -72.84026336669922, "logps/rejected": -89.51519775390625, "loss": 0.8204, "rewards/accuracies": 1.0, "rewards/chosen": 2.7476792335510254, "rewards/margins": 0.03477931022644043, "rewards/rejected": 2.712899923324585, "step": 1208 }, { "epoch": 0.27, "learning_rate": 9.720599375445125e-06, "logits/chosen": -1.5051361322402954, "logits/rejected": -1.3606512546539307, "logps/chosen": -135.64259338378906, "logps/rejected": -82.79161071777344, "loss": 0.5306, "rewards/accuracies": 1.0, "rewards/chosen": 6.677852153778076, "rewards/margins": 1.54341459274292, "rewards/rejected": 5.134437561035156, "step": 1209 }, { "epoch": 0.27, "learning_rate": 9.720008316748344e-06, "logits/chosen": -1.2619503736495972, "logits/rejected": -1.2167751789093018, "logps/chosen": -95.1552963256836, "logps/rejected": -79.9626235961914, "loss": 1.3068, "rewards/accuracies": 0.0, "rewards/chosen": 2.0759620666503906, "rewards/margins": -1.2985100746154785, "rewards/rejected": 3.374472141265869, "step": 1210 }, { "epoch": 0.27, "learning_rate": 9.719416651541839e-06, "logits/chosen": -1.1824811697006226, "logits/rejected": -1.2159428596496582, "logps/chosen": -41.12518310546875, "logps/rejected": -52.23484420776367, "loss": 0.6467, "rewards/accuracies": 0.0, "rewards/chosen": 1.5918315649032593, "rewards/margins": -0.7783364057540894, "rewards/rejected": 2.3701679706573486, "step": 1211 }, { "epoch": 0.27, "learning_rate": 9.718824379901639e-06, "logits/chosen": -1.143794059753418, "logits/rejected": -1.0504536628723145, "logps/chosen": -32.142433166503906, "logps/rejected": -36.52981948852539, "loss": 0.3461, "rewards/accuracies": 1.0, "rewards/chosen": 1.4790546894073486, "rewards/margins": 0.00665593147277832, "rewards/rejected": 1.4723987579345703, "step": 1212 }, { "epoch": 0.27, "learning_rate": 9.718231501903851e-06, "logits/chosen": -1.2415491342544556, "logits/rejected": -1.1590235233306885, "logps/chosen": -86.83702850341797, "logps/rejected": -89.79378509521484, "loss": 0.3937, "rewards/accuracies": 0.0, "rewards/chosen": 6.1749186515808105, "rewards/margins": -0.17590761184692383, "rewards/rejected": 6.350826263427734, "step": 1213 }, { "epoch": 0.27, "learning_rate": 9.717638017624657e-06, "logits/chosen": -1.3640756607055664, "logits/rejected": -1.2961560487747192, "logps/chosen": -64.51042175292969, "logps/rejected": -64.72494506835938, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": 5.204808235168457, "rewards/margins": 4.055889129638672, "rewards/rejected": 1.1489189863204956, "step": 1214 }, { "epoch": 0.27, "learning_rate": 9.717043927140319e-06, "logits/chosen": -1.269800066947937, "logits/rejected": -1.151658535003662, "logps/chosen": -77.95687866210938, "logps/rejected": -87.94097137451172, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 4.504176616668701, "rewards/margins": 2.4199297428131104, "rewards/rejected": 2.084246873855591, "step": 1215 }, { "epoch": 0.27, "learning_rate": 9.716449230527175e-06, "logits/chosen": -1.7338908910751343, "logits/rejected": -1.775252342224121, "logps/chosen": -65.51769256591797, "logps/rejected": -135.09556579589844, "loss": 0.284, "rewards/accuracies": 1.0, "rewards/chosen": 5.320525646209717, "rewards/margins": 0.47599124908447266, "rewards/rejected": 4.844534397125244, "step": 1216 }, { "epoch": 0.27, "learning_rate": 9.715853927861643e-06, "logits/chosen": -1.5499637126922607, "logits/rejected": -1.4759348630905151, "logps/chosen": -44.049049377441406, "logps/rejected": -47.44184875488281, "loss": 0.6425, "rewards/accuracies": 1.0, "rewards/chosen": 2.7624619007110596, "rewards/margins": 0.35680532455444336, "rewards/rejected": 2.405656576156616, "step": 1217 }, { "epoch": 0.27, "learning_rate": 9.71525801922022e-06, "logits/chosen": -1.4837151765823364, "logits/rejected": -1.4992352724075317, "logps/chosen": -82.02607727050781, "logps/rejected": -97.45464324951172, "loss": 1.2901, "rewards/accuracies": 0.0, "rewards/chosen": 2.3296830654144287, "rewards/margins": -2.3343985080718994, "rewards/rejected": 4.664081573486328, "step": 1218 }, { "epoch": 0.27, "learning_rate": 9.714661504679474e-06, "logits/chosen": -1.7528818845748901, "logits/rejected": -1.647217035293579, "logps/chosen": -138.44094848632812, "logps/rejected": -83.86595153808594, "loss": 0.2386, "rewards/accuracies": 1.0, "rewards/chosen": 5.931097507476807, "rewards/margins": 0.49991607666015625, "rewards/rejected": 5.43118143081665, "step": 1219 }, { "epoch": 0.27, "learning_rate": 9.71406438431606e-06, "logits/chosen": -1.2724173069000244, "logits/rejected": -1.2838670015335083, "logps/chosen": -130.378173828125, "logps/rejected": -188.21072387695312, "loss": 1.4434, "rewards/accuracies": 0.0, "rewards/chosen": 9.021636962890625, "rewards/margins": -2.8246307373046875, "rewards/rejected": 11.846267700195312, "step": 1220 }, { "epoch": 0.27, "learning_rate": 9.713466658206703e-06, "logits/chosen": -1.5946335792541504, "logits/rejected": -1.5989916324615479, "logps/chosen": -35.76039505004883, "logps/rejected": -59.45429229736328, "loss": 0.5286, "rewards/accuracies": 0.0, "rewards/chosen": 3.648157835006714, "rewards/margins": -0.03630566596984863, "rewards/rejected": 3.6844635009765625, "step": 1221 }, { "epoch": 0.27, "learning_rate": 9.712868326428213e-06, "logits/chosen": -1.1949539184570312, "logits/rejected": -1.1842621564865112, "logps/chosen": -36.550132751464844, "logps/rejected": -79.91209411621094, "loss": 0.585, "rewards/accuracies": 0.0, "rewards/chosen": 2.198305606842041, "rewards/margins": -0.3162193298339844, "rewards/rejected": 2.5145249366760254, "step": 1222 }, { "epoch": 0.27, "learning_rate": 9.712269389057471e-06, "logits/chosen": -1.417820692062378, "logits/rejected": -1.4603073596954346, "logps/chosen": -48.117488861083984, "logps/rejected": -47.04847717285156, "loss": 0.3554, "rewards/accuracies": 0.0, "rewards/chosen": 2.960402250289917, "rewards/margins": -0.018688678741455078, "rewards/rejected": 2.979090929031372, "step": 1223 }, { "epoch": 0.27, "learning_rate": 9.711669846171443e-06, "logits/chosen": -1.113512635231018, "logits/rejected": -1.152794599533081, "logps/chosen": -63.591796875, "logps/rejected": -52.51358413696289, "loss": 0.2359, "rewards/accuracies": 1.0, "rewards/chosen": 2.0396554470062256, "rewards/margins": 0.5091397762298584, "rewards/rejected": 1.5305156707763672, "step": 1224 }, { "epoch": 0.27, "learning_rate": 9.711069697847165e-06, "logits/chosen": -1.7195119857788086, "logits/rejected": -1.6809097528457642, "logps/chosen": -61.205726623535156, "logps/rejected": -62.78303527832031, "loss": 0.93, "rewards/accuracies": 0.0, "rewards/chosen": 2.382502794265747, "rewards/margins": -1.172452449798584, "rewards/rejected": 3.554955244064331, "step": 1225 }, { "epoch": 0.27, "learning_rate": 9.710468944161755e-06, "logits/chosen": -1.5197252035140991, "logits/rejected": -1.4412075281143188, "logps/chosen": -127.19863891601562, "logps/rejected": -132.28717041015625, "loss": 0.6006, "rewards/accuracies": 1.0, "rewards/chosen": 4.915133953094482, "rewards/margins": 1.5969913005828857, "rewards/rejected": 3.3181426525115967, "step": 1226 }, { "epoch": 0.27, "learning_rate": 9.70986758519241e-06, "logits/chosen": -1.7944566011428833, "logits/rejected": -1.6927939653396606, "logps/chosen": -82.27596282958984, "logps/rejected": -24.570117950439453, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 5.817505836486816, "rewards/margins": 6.029143810272217, "rewards/rejected": -0.21163788437843323, "step": 1227 }, { "epoch": 0.27, "learning_rate": 9.709265621016401e-06, "logits/chosen": -1.3690567016601562, "logits/rejected": -1.335616111755371, "logps/chosen": -46.628868103027344, "logps/rejected": -77.52220916748047, "loss": 1.6292, "rewards/accuracies": 0.0, "rewards/chosen": 2.9785232543945312, "rewards/margins": -1.371901035308838, "rewards/rejected": 4.350424289703369, "step": 1228 }, { "epoch": 0.27, "learning_rate": 9.708663051711083e-06, "logits/chosen": -1.4625566005706787, "logits/rejected": -1.532847285270691, "logps/chosen": -54.8431282043457, "logps/rejected": -45.781368255615234, "loss": 1.3641, "rewards/accuracies": 0.0, "rewards/chosen": 1.8042598962783813, "rewards/margins": -2.2861948013305664, "rewards/rejected": 4.090454578399658, "step": 1229 }, { "epoch": 0.27, "learning_rate": 9.708059877353881e-06, "logits/chosen": -1.382625699043274, "logits/rejected": -1.1550575494766235, "logps/chosen": -137.36480712890625, "logps/rejected": -76.5150146484375, "loss": 0.5533, "rewards/accuracies": 0.0, "rewards/chosen": 4.290820598602295, "rewards/margins": -0.7046852111816406, "rewards/rejected": 4.9955058097839355, "step": 1230 }, { "epoch": 0.27, "learning_rate": 9.707456098022303e-06, "logits/chosen": -1.3764712810516357, "logits/rejected": -1.2915101051330566, "logps/chosen": -99.55565643310547, "logps/rejected": -46.76585006713867, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 5.2103495597839355, "rewards/margins": 1.8100049495697021, "rewards/rejected": 3.4003446102142334, "step": 1231 }, { "epoch": 0.27, "learning_rate": 9.706851713793932e-06, "logits/chosen": -1.445975422859192, "logits/rejected": -1.4423097372055054, "logps/chosen": -73.2621841430664, "logps/rejected": -66.67337036132812, "loss": 0.543, "rewards/accuracies": 1.0, "rewards/chosen": 2.9095618724823, "rewards/margins": 1.1572867631912231, "rewards/rejected": 1.7522751092910767, "step": 1232 }, { "epoch": 0.27, "learning_rate": 9.706246724746433e-06, "logits/chosen": -1.125623106956482, "logits/rejected": -1.150367259979248, "logps/chosen": -94.90770721435547, "logps/rejected": -134.87432861328125, "loss": 2.2885, "rewards/accuracies": 0.0, "rewards/chosen": 4.960541725158691, "rewards/margins": -2.2901864051818848, "rewards/rejected": 7.250728130340576, "step": 1233 }, { "epoch": 0.27, "learning_rate": 9.705641130957541e-06, "logits/chosen": -1.4798730611801147, "logits/rejected": -1.4239258766174316, "logps/chosen": -65.09017181396484, "logps/rejected": -64.60823059082031, "loss": 0.5995, "rewards/accuracies": 0.0, "rewards/chosen": 2.7263519763946533, "rewards/margins": -0.295259952545166, "rewards/rejected": 3.0216119289398193, "step": 1234 }, { "epoch": 0.27, "learning_rate": 9.705034932505076e-06, "logits/chosen": -1.3205002546310425, "logits/rejected": -1.3021787405014038, "logps/chosen": -34.2781982421875, "logps/rejected": -95.36508178710938, "loss": 0.5882, "rewards/accuracies": 0.0, "rewards/chosen": 2.8493027687072754, "rewards/margins": -0.7543303966522217, "rewards/rejected": 3.603633165359497, "step": 1235 }, { "epoch": 0.27, "learning_rate": 9.704428129466934e-06, "logits/chosen": -1.3955389261245728, "logits/rejected": -1.3059190511703491, "logps/chosen": -59.846839904785156, "logps/rejected": -31.394271850585938, "loss": 1.3138, "rewards/accuracies": 1.0, "rewards/chosen": 4.642664432525635, "rewards/margins": 0.2604546546936035, "rewards/rejected": 4.382209777832031, "step": 1236 }, { "epoch": 0.27, "learning_rate": 9.703820721921085e-06, "logits/chosen": -1.298139214515686, "logits/rejected": -1.2767106294631958, "logps/chosen": -40.98277282714844, "logps/rejected": -60.76891326904297, "loss": 1.5024, "rewards/accuracies": 0.0, "rewards/chosen": 1.7648380994796753, "rewards/margins": -0.8497203588485718, "rewards/rejected": 2.614558458328247, "step": 1237 }, { "epoch": 0.27, "learning_rate": 9.703212709945583e-06, "logits/chosen": -1.4896076917648315, "logits/rejected": -1.3272186517715454, "logps/chosen": -88.85736083984375, "logps/rejected": -16.99661636352539, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 5.01485013961792, "rewards/margins": 4.723071575164795, "rewards/rejected": 0.291778564453125, "step": 1238 }, { "epoch": 0.27, "learning_rate": 9.70260409361855e-06, "logits/chosen": -1.5546613931655884, "logits/rejected": -1.5694812536239624, "logps/chosen": -51.63496398925781, "logps/rejected": -55.60002136230469, "loss": 0.5455, "rewards/accuracies": 1.0, "rewards/chosen": 3.2212188243865967, "rewards/margins": 1.6651145219802856, "rewards/rejected": 1.556104302406311, "step": 1239 }, { "epoch": 0.27, "learning_rate": 9.701994873018198e-06, "logits/chosen": -1.0894862413406372, "logits/rejected": -1.183459997177124, "logps/chosen": -72.55825805664062, "logps/rejected": -87.05374145507812, "loss": 1.9851, "rewards/accuracies": 0.0, "rewards/chosen": 2.1706223487854004, "rewards/margins": -3.91644287109375, "rewards/rejected": 6.08706521987915, "step": 1240 }, { "epoch": 0.27, "learning_rate": 9.70138504822281e-06, "logits/chosen": -1.4530913829803467, "logits/rejected": -1.4585835933685303, "logps/chosen": -152.3395233154297, "logps/rejected": -136.0052947998047, "loss": 0.568, "rewards/accuracies": 0.0, "rewards/chosen": 5.963321208953857, "rewards/margins": -0.5329799652099609, "rewards/rejected": 6.496301174163818, "step": 1241 }, { "epoch": 0.27, "learning_rate": 9.700774619310744e-06, "logits/chosen": -1.3673175573349, "logits/rejected": -1.3414173126220703, "logps/chosen": -85.69502258300781, "logps/rejected": -54.97632598876953, "loss": 0.6778, "rewards/accuracies": 0.0, "rewards/chosen": 2.2207953929901123, "rewards/margins": -0.2960214614868164, "rewards/rejected": 2.5168168544769287, "step": 1242 }, { "epoch": 0.28, "learning_rate": 9.700163586360438e-06, "logits/chosen": -1.591632604598999, "logits/rejected": -1.4212620258331299, "logps/chosen": -111.08773803710938, "logps/rejected": -95.46956634521484, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 6.22719144821167, "rewards/margins": 2.454854726791382, "rewards/rejected": 3.772336721420288, "step": 1243 }, { "epoch": 0.28, "learning_rate": 9.699551949450412e-06, "logits/chosen": -1.6032681465148926, "logits/rejected": -1.269321084022522, "logps/chosen": -138.97381591796875, "logps/rejected": -16.164104461669922, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 5.225427150726318, "rewards/margins": 4.810982704162598, "rewards/rejected": 0.41444435715675354, "step": 1244 }, { "epoch": 0.28, "learning_rate": 9.698939708659258e-06, "logits/chosen": -1.5795009136199951, "logits/rejected": -1.5611941814422607, "logps/chosen": -35.79411315917969, "logps/rejected": -44.056453704833984, "loss": 2.5821, "rewards/accuracies": 0.0, "rewards/chosen": 2.0376670360565186, "rewards/margins": -1.0449323654174805, "rewards/rejected": 3.082599401473999, "step": 1245 }, { "epoch": 0.28, "learning_rate": 9.698326864065646e-06, "logits/chosen": -1.4469088315963745, "logits/rejected": -1.3204721212387085, "logps/chosen": -69.9903564453125, "logps/rejected": -117.84359741210938, "loss": 0.4393, "rewards/accuracies": 0.0, "rewards/chosen": 6.287560939788818, "rewards/margins": -0.23106098175048828, "rewards/rejected": 6.518621921539307, "step": 1246 }, { "epoch": 0.28, "learning_rate": 9.697713415748327e-06, "logits/chosen": -1.2282977104187012, "logits/rejected": -1.199982762336731, "logps/chosen": -76.7237548828125, "logps/rejected": -70.73751831054688, "loss": 3.4389, "rewards/accuracies": 1.0, "rewards/chosen": 3.130929708480835, "rewards/margins": 0.2968308925628662, "rewards/rejected": 2.8340988159179688, "step": 1247 }, { "epoch": 0.28, "learning_rate": 9.697099363786127e-06, "logits/chosen": -1.2364860773086548, "logits/rejected": -0.9609298706054688, "logps/chosen": -155.02316284179688, "logps/rejected": -10.146728515625, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": 5.4328813552856445, "rewards/margins": 4.151988983154297, "rewards/rejected": 1.2808923721313477, "step": 1248 }, { "epoch": 0.28, "learning_rate": 9.69648470825795e-06, "logits/chosen": -1.3287426233291626, "logits/rejected": -1.34139084815979, "logps/chosen": -37.501251220703125, "logps/rejected": -81.97575378417969, "loss": 1.9158, "rewards/accuracies": 0.0, "rewards/chosen": 2.0544421672821045, "rewards/margins": -3.094219923019409, "rewards/rejected": 5.148662090301514, "step": 1249 }, { "epoch": 0.28, "learning_rate": 9.695869449242779e-06, "logits/chosen": -1.6596696376800537, "logits/rejected": -1.41422700881958, "logps/chosen": -59.32440948486328, "logps/rejected": -31.17597770690918, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 4.183138370513916, "rewards/margins": 2.7348670959472656, "rewards/rejected": 1.44827139377594, "step": 1250 }, { "epoch": 0.28, "learning_rate": 9.695253586819672e-06, "logits/chosen": -1.08405339717865, "logits/rejected": -0.9078648090362549, "logps/chosen": -62.16792297363281, "logps/rejected": -44.69087219238281, "loss": 0.2687, "rewards/accuracies": 1.0, "rewards/chosen": 1.7744216918945312, "rewards/margins": 0.35617363452911377, "rewards/rejected": 1.4182480573654175, "step": 1251 }, { "epoch": 0.28, "learning_rate": 9.694637121067764e-06, "logits/chosen": -1.03047776222229, "logits/rejected": -1.0351312160491943, "logps/chosen": -25.007719039916992, "logps/rejected": -17.219980239868164, "loss": 1.9683, "rewards/accuracies": 0.0, "rewards/chosen": 0.11587543785572052, "rewards/margins": -0.593784749507904, "rewards/rejected": 0.7096601724624634, "step": 1252 }, { "epoch": 0.28, "learning_rate": 9.694020052066275e-06, "logits/chosen": -1.3361380100250244, "logits/rejected": -1.3774484395980835, "logps/chosen": -31.968740463256836, "logps/rejected": -35.031005859375, "loss": 0.9556, "rewards/accuracies": 0.0, "rewards/chosen": 1.8613992929458618, "rewards/margins": -1.6239110231399536, "rewards/rejected": 3.4853103160858154, "step": 1253 }, { "epoch": 0.28, "learning_rate": 9.693402379894492e-06, "logits/chosen": -1.3864444494247437, "logits/rejected": -1.393398642539978, "logps/chosen": -90.73731994628906, "logps/rejected": -75.01458740234375, "loss": 0.7732, "rewards/accuracies": 1.0, "rewards/chosen": 5.078016757965088, "rewards/margins": 1.307504415512085, "rewards/rejected": 3.770512342453003, "step": 1254 }, { "epoch": 0.28, "learning_rate": 9.692784104631785e-06, "logits/chosen": -1.374982476234436, "logits/rejected": -1.2352041006088257, "logps/chosen": -81.37164306640625, "logps/rejected": -36.83588409423828, "loss": 0.3269, "rewards/accuracies": 1.0, "rewards/chosen": 4.843287944793701, "rewards/margins": 2.5686404705047607, "rewards/rejected": 2.2746474742889404, "step": 1255 }, { "epoch": 0.28, "learning_rate": 9.692165226357603e-06, "logits/chosen": -1.4396013021469116, "logits/rejected": -1.3792617321014404, "logps/chosen": -155.03465270996094, "logps/rejected": -91.88298034667969, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": 6.319962978363037, "rewards/margins": 2.0975217819213867, "rewards/rejected": 4.22244119644165, "step": 1256 }, { "epoch": 0.28, "learning_rate": 9.691545745151469e-06, "logits/chosen": -1.3534842729568481, "logits/rejected": -1.315086007118225, "logps/chosen": -78.1563949584961, "logps/rejected": -105.56265258789062, "loss": 0.9523, "rewards/accuracies": 0.0, "rewards/chosen": 2.7987327575683594, "rewards/margins": -0.3471031188964844, "rewards/rejected": 3.1458358764648438, "step": 1257 }, { "epoch": 0.28, "learning_rate": 9.690925661092984e-06, "logits/chosen": -1.2899316549301147, "logits/rejected": -1.18072509765625, "logps/chosen": -48.46430206298828, "logps/rejected": -31.953998565673828, "loss": 0.7226, "rewards/accuracies": 0.0, "rewards/chosen": 3.0375144481658936, "rewards/margins": -1.0096461772918701, "rewards/rejected": 4.047160625457764, "step": 1258 }, { "epoch": 0.28, "learning_rate": 9.690304974261828e-06, "logits/chosen": -1.4097719192504883, "logits/rejected": -1.1063307523727417, "logps/chosen": -108.94020080566406, "logps/rejected": -68.58761596679688, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 5.193473815917969, "rewards/margins": 1.3928205966949463, "rewards/rejected": 3.8006532192230225, "step": 1259 }, { "epoch": 0.28, "learning_rate": 9.689683684737758e-06, "logits/chosen": -1.3843671083450317, "logits/rejected": -1.286493182182312, "logps/chosen": -51.465087890625, "logps/rejected": -79.60829162597656, "loss": 0.234, "rewards/accuracies": 1.0, "rewards/chosen": 2.095888614654541, "rewards/margins": 0.5657585859298706, "rewards/rejected": 1.5301300287246704, "step": 1260 }, { "epoch": 0.28, "learning_rate": 9.68906179260061e-06, "logits/chosen": -1.6771459579467773, "logits/rejected": -1.7339977025985718, "logps/chosen": -67.69229125976562, "logps/rejected": -64.07670593261719, "loss": 0.3877, "rewards/accuracies": 1.0, "rewards/chosen": 2.3231201171875, "rewards/margins": 0.6316970586776733, "rewards/rejected": 1.6914230585098267, "step": 1261 }, { "epoch": 0.28, "learning_rate": 9.688439297930292e-06, "logits/chosen": -1.6696275472640991, "logits/rejected": -1.6213358640670776, "logps/chosen": -78.38533020019531, "logps/rejected": -26.40042495727539, "loss": 0.237, "rewards/accuracies": 1.0, "rewards/chosen": 2.705793857574463, "rewards/margins": 0.664780855178833, "rewards/rejected": 2.04101300239563, "step": 1262 }, { "epoch": 0.28, "learning_rate": 9.687816200806795e-06, "logits/chosen": -1.4500141143798828, "logits/rejected": -1.4626390933990479, "logps/chosen": -49.502601623535156, "logps/rejected": -35.792564392089844, "loss": 0.8485, "rewards/accuracies": 1.0, "rewards/chosen": 1.92822265625, "rewards/margins": 0.06268656253814697, "rewards/rejected": 1.865536093711853, "step": 1263 }, { "epoch": 0.28, "learning_rate": 9.687192501310186e-06, "logits/chosen": -1.2058998346328735, "logits/rejected": -1.2192599773406982, "logps/chosen": -84.28709411621094, "logps/rejected": -100.44755554199219, "loss": 0.7228, "rewards/accuracies": 0.0, "rewards/chosen": 4.382438659667969, "rewards/margins": -1.171727180480957, "rewards/rejected": 5.554165840148926, "step": 1264 }, { "epoch": 0.28, "learning_rate": 9.68656819952061e-06, "logits/chosen": -1.4370883703231812, "logits/rejected": -1.2967232465744019, "logps/chosen": -59.864105224609375, "logps/rejected": -65.22004699707031, "loss": 0.0718, "rewards/accuracies": 1.0, "rewards/chosen": 5.460608005523682, "rewards/margins": 2.2294509410858154, "rewards/rejected": 3.231157064437866, "step": 1265 }, { "epoch": 0.28, "learning_rate": 9.685943295518283e-06, "logits/chosen": -1.3493523597717285, "logits/rejected": -1.2976467609405518, "logps/chosen": -49.89517593383789, "logps/rejected": -35.81591796875, "loss": 0.208, "rewards/accuracies": 1.0, "rewards/chosen": 2.536410093307495, "rewards/margins": 0.6886142492294312, "rewards/rejected": 1.847795844078064, "step": 1266 }, { "epoch": 0.28, "learning_rate": 9.685317789383509e-06, "logits/chosen": -1.1416593790054321, "logits/rejected": -1.0686129331588745, "logps/chosen": -62.275882720947266, "logps/rejected": -79.4307861328125, "loss": 0.3928, "rewards/accuracies": 1.0, "rewards/chosen": 3.611143112182617, "rewards/margins": 0.21923017501831055, "rewards/rejected": 3.3919129371643066, "step": 1267 }, { "epoch": 0.28, "learning_rate": 9.684691681196664e-06, "logits/chosen": -1.42966628074646, "logits/rejected": -1.410285234451294, "logps/chosen": -93.24195098876953, "logps/rejected": -49.21503448486328, "loss": 1.4424, "rewards/accuracies": 1.0, "rewards/chosen": 3.557375431060791, "rewards/margins": 0.4981536865234375, "rewards/rejected": 3.0592217445373535, "step": 1268 }, { "epoch": 0.28, "learning_rate": 9.684064971038196e-06, "logits/chosen": -1.3330209255218506, "logits/rejected": -1.2320939302444458, "logps/chosen": -158.10166931152344, "logps/rejected": -94.95620727539062, "loss": 0.1673, "rewards/accuracies": 1.0, "rewards/chosen": 5.454066753387451, "rewards/margins": 2.1789560317993164, "rewards/rejected": 3.2751107215881348, "step": 1269 }, { "epoch": 0.28, "learning_rate": 9.683437658988642e-06, "logits/chosen": -1.1717073917388916, "logits/rejected": -1.2121981382369995, "logps/chosen": -32.123374938964844, "logps/rejected": -65.9233627319336, "loss": 1.1944, "rewards/accuracies": 0.0, "rewards/chosen": 2.156224012374878, "rewards/margins": -1.43768310546875, "rewards/rejected": 3.593907117843628, "step": 1270 }, { "epoch": 0.28, "learning_rate": 9.682809745128607e-06, "logits/chosen": -1.1829754114151, "logits/rejected": -1.151687741279602, "logps/chosen": -92.71197509765625, "logps/rejected": -46.838218688964844, "loss": 0.5606, "rewards/accuracies": 0.0, "rewards/chosen": 5.11517333984375, "rewards/margins": -0.1824197769165039, "rewards/rejected": 5.297593116760254, "step": 1271 }, { "epoch": 0.28, "learning_rate": 9.682181229538776e-06, "logits/chosen": -1.5541478395462036, "logits/rejected": -1.6099871397018433, "logps/chosen": -135.17428588867188, "logps/rejected": -129.39596557617188, "loss": 1.4334, "rewards/accuracies": 0.0, "rewards/chosen": 5.738166809082031, "rewards/margins": -2.7493762969970703, "rewards/rejected": 8.487543106079102, "step": 1272 }, { "epoch": 0.28, "learning_rate": 9.681552112299914e-06, "logits/chosen": -1.5663371086120605, "logits/rejected": -1.3568447828292847, "logps/chosen": -90.98164367675781, "logps/rejected": -57.28596115112305, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 6.058537483215332, "rewards/margins": 3.982658863067627, "rewards/rejected": 2.075878620147705, "step": 1273 }, { "epoch": 0.28, "learning_rate": 9.680922393492858e-06, "logits/chosen": -1.1895177364349365, "logits/rejected": -1.0622367858886719, "logps/chosen": -105.76492309570312, "logps/rejected": -58.971275329589844, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 5.570292949676514, "rewards/margins": 2.445568323135376, "rewards/rejected": 3.1247246265411377, "step": 1274 }, { "epoch": 0.28, "learning_rate": 9.68029207319853e-06, "logits/chosen": -1.1506990194320679, "logits/rejected": -1.1506990194320679, "logps/chosen": -37.86112976074219, "logps/rejected": -37.86112976074219, "loss": 0.8925, "rewards/accuracies": 0.0, "rewards/chosen": 4.423877239227295, "rewards/margins": 0.0, "rewards/rejected": 4.423877239227295, "step": 1275 }, { "epoch": 0.28, "learning_rate": 9.679661151497919e-06, "logits/chosen": -1.3837617635726929, "logits/rejected": -1.3687999248504639, "logps/chosen": -31.45980453491211, "logps/rejected": -55.25593948364258, "loss": 1.4573, "rewards/accuracies": 0.0, "rewards/chosen": 3.1057088375091553, "rewards/margins": -0.7315771579742432, "rewards/rejected": 3.8372859954833984, "step": 1276 }, { "epoch": 0.28, "learning_rate": 9.6790296284721e-06, "logits/chosen": -1.0856964588165283, "logits/rejected": -1.0856964588165283, "logps/chosen": -51.18164825439453, "logps/rejected": -51.18164825439453, "loss": 0.6449, "rewards/accuracies": 0.0, "rewards/chosen": 1.602883219718933, "rewards/margins": 0.0, "rewards/rejected": 1.602883219718933, "step": 1277 }, { "epoch": 0.28, "learning_rate": 9.678397504202222e-06, "logits/chosen": -1.2196247577667236, "logits/rejected": -1.0859545469284058, "logps/chosen": -149.57192993164062, "logps/rejected": -60.157325744628906, "loss": 0.2158, "rewards/accuracies": 1.0, "rewards/chosen": 4.471395969390869, "rewards/margins": 0.7299964427947998, "rewards/rejected": 3.7413995265960693, "step": 1278 }, { "epoch": 0.28, "learning_rate": 9.677764778769512e-06, "logits/chosen": -1.1521430015563965, "logits/rejected": -1.1194162368774414, "logps/chosen": -53.62202453613281, "logps/rejected": -152.26296997070312, "loss": 1.5659, "rewards/accuracies": 1.0, "rewards/chosen": 3.4210472106933594, "rewards/margins": 1.4284201860427856, "rewards/rejected": 1.9926270246505737, "step": 1279 }, { "epoch": 0.28, "learning_rate": 9.677131452255272e-06, "logits/chosen": -1.3611093759536743, "logits/rejected": -1.1891582012176514, "logps/chosen": -66.51031494140625, "logps/rejected": -72.79508972167969, "loss": 3.2558, "rewards/accuracies": 1.0, "rewards/chosen": 2.448254346847534, "rewards/margins": 0.26728129386901855, "rewards/rejected": 2.1809730529785156, "step": 1280 }, { "epoch": 0.28, "learning_rate": 9.676497524740885e-06, "logits/chosen": -1.216177225112915, "logits/rejected": -1.291093111038208, "logps/chosen": -39.240779876708984, "logps/rejected": -103.3406982421875, "loss": 2.032, "rewards/accuracies": 0.0, "rewards/chosen": 2.519987106323242, "rewards/margins": -3.5482993125915527, "rewards/rejected": 6.068286418914795, "step": 1281 }, { "epoch": 0.28, "learning_rate": 9.675862996307808e-06, "logits/chosen": -1.3983138799667358, "logits/rejected": -1.371860146522522, "logps/chosen": -50.30846405029297, "logps/rejected": -48.29450225830078, "loss": 0.5237, "rewards/accuracies": 1.0, "rewards/chosen": 2.9827468395233154, "rewards/margins": 0.25743627548217773, "rewards/rejected": 2.7253105640411377, "step": 1282 }, { "epoch": 0.28, "learning_rate": 9.675227867037576e-06, "logits/chosen": -1.2445021867752075, "logits/rejected": -1.216382622718811, "logps/chosen": -48.58486557006836, "logps/rejected": -48.4099235534668, "loss": 0.4407, "rewards/accuracies": 1.0, "rewards/chosen": 3.599522829055786, "rewards/margins": 0.9896972179412842, "rewards/rejected": 2.609825611114502, "step": 1283 }, { "epoch": 0.28, "learning_rate": 9.674592137011801e-06, "logits/chosen": -1.5272103548049927, "logits/rejected": -1.5546422004699707, "logps/chosen": -77.0565185546875, "logps/rejected": -106.45161437988281, "loss": 2.3273, "rewards/accuracies": 0.0, "rewards/chosen": 6.7640886306762695, "rewards/margins": -3.5764284133911133, "rewards/rejected": 10.340517044067383, "step": 1284 }, { "epoch": 0.28, "learning_rate": 9.673955806312175e-06, "logits/chosen": -1.269559383392334, "logits/rejected": -1.2148394584655762, "logps/chosen": -56.83168029785156, "logps/rejected": -26.50125503540039, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 3.543347120285034, "rewards/margins": 2.2628254890441895, "rewards/rejected": 1.2805217504501343, "step": 1285 }, { "epoch": 0.28, "learning_rate": 9.673318875020463e-06, "logits/chosen": -1.4881703853607178, "logits/rejected": -1.4056724309921265, "logps/chosen": -62.92921447753906, "logps/rejected": -41.359249114990234, "loss": 0.5615, "rewards/accuracies": 1.0, "rewards/chosen": 2.502925157546997, "rewards/margins": 1.5213432312011719, "rewards/rejected": 0.9815818667411804, "step": 1286 }, { "epoch": 0.28, "learning_rate": 9.67268134321851e-06, "logits/chosen": -1.5956857204437256, "logits/rejected": -1.5044140815734863, "logps/chosen": -113.30609130859375, "logps/rejected": -73.09880065917969, "loss": 0.2084, "rewards/accuracies": 1.0, "rewards/chosen": 5.30421781539917, "rewards/margins": 0.6607012748718262, "rewards/rejected": 4.643516540527344, "step": 1287 }, { "epoch": 0.29, "learning_rate": 9.672043210988237e-06, "logits/chosen": -1.3491452932357788, "logits/rejected": -1.3125158548355103, "logps/chosen": -158.60812377929688, "logps/rejected": -139.8681182861328, "loss": 0.9934, "rewards/accuracies": 0.0, "rewards/chosen": 7.149740695953369, "rewards/margins": -1.8385348320007324, "rewards/rejected": 8.988275527954102, "step": 1288 }, { "epoch": 0.29, "learning_rate": 9.671404478411645e-06, "logits/chosen": -1.3217103481292725, "logits/rejected": -1.2432008981704712, "logps/chosen": -45.578224182128906, "logps/rejected": -43.77750015258789, "loss": 1.9684, "rewards/accuracies": 1.0, "rewards/chosen": 2.5531044006347656, "rewards/margins": 0.1428508758544922, "rewards/rejected": 2.4102535247802734, "step": 1289 }, { "epoch": 0.29, "learning_rate": 9.670765145570804e-06, "logits/chosen": -1.4518052339553833, "logits/rejected": -1.4191993474960327, "logps/chosen": -34.67811584472656, "logps/rejected": -64.22685241699219, "loss": 1.1325, "rewards/accuracies": 0.0, "rewards/chosen": 1.1966041326522827, "rewards/margins": -1.3389657735824585, "rewards/rejected": 2.535569906234741, "step": 1290 }, { "epoch": 0.29, "learning_rate": 9.670125212547872e-06, "logits/chosen": -1.5278956890106201, "logits/rejected": -1.4915143251419067, "logps/chosen": -58.81979751586914, "logps/rejected": -57.727699279785156, "loss": 1.0679, "rewards/accuracies": 0.0, "rewards/chosen": 4.151607990264893, "rewards/margins": -0.5645170211791992, "rewards/rejected": 4.716125011444092, "step": 1291 }, { "epoch": 0.29, "learning_rate": 9.669484679425077e-06, "logits/chosen": -1.6227930784225464, "logits/rejected": -1.5314443111419678, "logps/chosen": -70.77141571044922, "logps/rejected": -54.44688415527344, "loss": 0.4852, "rewards/accuracies": 0.0, "rewards/chosen": 3.6000916957855225, "rewards/margins": -0.48244309425354004, "rewards/rejected": 4.0825347900390625, "step": 1292 }, { "epoch": 0.29, "learning_rate": 9.668843546284725e-06, "logits/chosen": -1.2758665084838867, "logits/rejected": -1.264049768447876, "logps/chosen": -49.553138732910156, "logps/rejected": -56.24079132080078, "loss": 0.6718, "rewards/accuracies": 0.0, "rewards/chosen": 3.2391228675842285, "rewards/margins": -0.9698233604431152, "rewards/rejected": 4.208946228027344, "step": 1293 }, { "epoch": 0.29, "learning_rate": 9.668201813209202e-06, "logits/chosen": -1.233606219291687, "logits/rejected": -1.0942238569259644, "logps/chosen": -60.74330139160156, "logps/rejected": -60.28467559814453, "loss": 0.1999, "rewards/accuracies": 1.0, "rewards/chosen": 2.9829843044281006, "rewards/margins": 1.1186044216156006, "rewards/rejected": 1.8643798828125, "step": 1294 }, { "epoch": 0.29, "learning_rate": 9.667559480280968e-06, "logits/chosen": -1.5771458148956299, "logits/rejected": -1.5263715982437134, "logps/chosen": -108.82923889160156, "logps/rejected": -69.67752838134766, "loss": 0.9593, "rewards/accuracies": 1.0, "rewards/chosen": 6.866032600402832, "rewards/margins": 2.600034236907959, "rewards/rejected": 4.265998363494873, "step": 1295 }, { "epoch": 0.29, "learning_rate": 9.66691654758256e-06, "logits/chosen": -1.376306176185608, "logits/rejected": -1.2375491857528687, "logps/chosen": -105.76193237304688, "logps/rejected": -37.17696762084961, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 6.962960720062256, "rewards/margins": 4.573757171630859, "rewards/rejected": 2.3892037868499756, "step": 1296 }, { "epoch": 0.29, "learning_rate": 9.666273015196595e-06, "logits/chosen": -1.2684862613677979, "logits/rejected": -1.2520763874053955, "logps/chosen": -71.40247344970703, "logps/rejected": -25.238008499145508, "loss": 1.0451, "rewards/accuracies": 0.0, "rewards/chosen": 2.318161725997925, "rewards/margins": -1.4656593799591064, "rewards/rejected": 3.7838211059570312, "step": 1297 }, { "epoch": 0.29, "learning_rate": 9.665628883205765e-06, "logits/chosen": -1.4556313753128052, "logits/rejected": -1.4556313753128052, "logps/chosen": -30.418996810913086, "logps/rejected": -30.418996810913086, "loss": 0.6115, "rewards/accuracies": 0.0, "rewards/chosen": 2.6172006130218506, "rewards/margins": 0.0, "rewards/rejected": 2.6172006130218506, "step": 1298 }, { "epoch": 0.29, "learning_rate": 9.66498415169284e-06, "logits/chosen": -1.4521845579147339, "logits/rejected": -1.339227557182312, "logps/chosen": -79.54498291015625, "logps/rejected": -57.27375411987305, "loss": 0.3389, "rewards/accuracies": 1.0, "rewards/chosen": 4.256679534912109, "rewards/margins": 0.0438389778137207, "rewards/rejected": 4.212840557098389, "step": 1299 }, { "epoch": 0.29, "learning_rate": 9.664338820740664e-06, "logits/chosen": -1.5607444047927856, "logits/rejected": -1.4474815130233765, "logps/chosen": -119.5821762084961, "logps/rejected": -63.614097595214844, "loss": 0.7519, "rewards/accuracies": 1.0, "rewards/chosen": 5.829865455627441, "rewards/margins": 2.9372689723968506, "rewards/rejected": 2.892596483230591, "step": 1300 }, { "epoch": 0.29, "learning_rate": 9.663692890432164e-06, "logits/chosen": -1.3403836488723755, "logits/rejected": -1.2760899066925049, "logps/chosen": -54.57837677001953, "logps/rejected": -24.724916458129883, "loss": 1.0106, "rewards/accuracies": 1.0, "rewards/chosen": 3.2765541076660156, "rewards/margins": 0.40780115127563477, "rewards/rejected": 2.868752956390381, "step": 1301 }, { "epoch": 0.29, "learning_rate": 9.663046360850338e-06, "logits/chosen": -1.3263884782791138, "logits/rejected": -1.3368746042251587, "logps/chosen": -67.5126953125, "logps/rejected": -103.07380676269531, "loss": 0.4717, "rewards/accuracies": 0.0, "rewards/chosen": 6.522688388824463, "rewards/margins": -0.40997791290283203, "rewards/rejected": 6.932666301727295, "step": 1302 }, { "epoch": 0.29, "learning_rate": 9.662399232078264e-06, "logits/chosen": -1.4304593801498413, "logits/rejected": -1.248794436454773, "logps/chosen": -199.2066650390625, "logps/rejected": -29.540163040161133, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": 5.310641765594482, "rewards/margins": 4.7465033531188965, "rewards/rejected": 0.5641385912895203, "step": 1303 }, { "epoch": 0.29, "learning_rate": 9.661751504199097e-06, "logits/chosen": -1.0819716453552246, "logits/rejected": -1.07297682762146, "logps/chosen": -91.35044860839844, "logps/rejected": -67.6370849609375, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 7.837062358856201, "rewards/margins": 4.764406204223633, "rewards/rejected": 3.0726563930511475, "step": 1304 }, { "epoch": 0.29, "learning_rate": 9.661103177296069e-06, "logits/chosen": -1.2770142555236816, "logits/rejected": -1.2974883317947388, "logps/chosen": -59.93180847167969, "logps/rejected": -40.56016540527344, "loss": 0.3336, "rewards/accuracies": 1.0, "rewards/chosen": 2.3919456005096436, "rewards/margins": 0.05333590507507324, "rewards/rejected": 2.3386096954345703, "step": 1305 }, { "epoch": 0.29, "learning_rate": 9.660454251452487e-06, "logits/chosen": -1.1202149391174316, "logits/rejected": -0.8743929266929626, "logps/chosen": -51.21131896972656, "logps/rejected": -119.67440795898438, "loss": 0.9685, "rewards/accuracies": 0.0, "rewards/chosen": 1.646508812904358, "rewards/margins": -1.7169891595840454, "rewards/rejected": 3.3634979724884033, "step": 1306 }, { "epoch": 0.29, "learning_rate": 9.659804726751737e-06, "logits/chosen": -1.0768998861312866, "logits/rejected": -0.946659505367279, "logps/chosen": -35.32105255126953, "logps/rejected": -11.136188507080078, "loss": 0.4806, "rewards/accuracies": 1.0, "rewards/chosen": 1.4566220045089722, "rewards/margins": 1.179307460784912, "rewards/rejected": 0.27731457352638245, "step": 1307 }, { "epoch": 0.29, "learning_rate": 9.659154603277283e-06, "logits/chosen": -1.5149790048599243, "logits/rejected": -1.5954155921936035, "logps/chosen": -48.951622009277344, "logps/rejected": -119.80735778808594, "loss": 3.6078, "rewards/accuracies": 0.0, "rewards/chosen": 2.9433510303497314, "rewards/margins": -4.708426475524902, "rewards/rejected": 7.651777744293213, "step": 1308 }, { "epoch": 0.29, "learning_rate": 9.658503881112661e-06, "logits/chosen": -1.146580696105957, "logits/rejected": -1.146580696105957, "logps/chosen": -44.92486572265625, "logps/rejected": -44.92486572265625, "loss": 0.6695, "rewards/accuracies": 0.0, "rewards/chosen": 4.4663238525390625, "rewards/margins": 0.0, "rewards/rejected": 4.4663238525390625, "step": 1309 }, { "epoch": 0.29, "learning_rate": 9.65785256034149e-06, "logits/chosen": -1.575359582901001, "logits/rejected": -1.4866392612457275, "logps/chosen": -66.99974060058594, "logps/rejected": -42.196250915527344, "loss": 0.606, "rewards/accuracies": 1.0, "rewards/chosen": 3.307882785797119, "rewards/margins": 0.11164712905883789, "rewards/rejected": 3.1962356567382812, "step": 1310 }, { "epoch": 0.29, "learning_rate": 9.657200641047462e-06, "logits/chosen": -1.3074917793273926, "logits/rejected": -1.561716079711914, "logps/chosen": -80.72441101074219, "logps/rejected": -111.09706115722656, "loss": 1.3221, "rewards/accuracies": 0.0, "rewards/chosen": 4.21388578414917, "rewards/margins": -1.220536708831787, "rewards/rejected": 5.434422492980957, "step": 1311 }, { "epoch": 0.29, "learning_rate": 9.656548123314346e-06, "logits/chosen": -1.4438294172286987, "logits/rejected": -1.1955713033676147, "logps/chosen": -82.05386352539062, "logps/rejected": -50.74128723144531, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 4.147510051727295, "rewards/margins": 2.153761386871338, "rewards/rejected": 1.9937485456466675, "step": 1312 }, { "epoch": 0.29, "learning_rate": 9.655895007225992e-06, "logits/chosen": -1.4140981435775757, "logits/rejected": -1.4868046045303345, "logps/chosen": -34.44255447387695, "logps/rejected": -125.38787841796875, "loss": 2.5149, "rewards/accuracies": 0.0, "rewards/chosen": 2.012770891189575, "rewards/margins": -4.837635040283203, "rewards/rejected": 6.850406169891357, "step": 1313 }, { "epoch": 0.29, "learning_rate": 9.655241292866321e-06, "logits/chosen": -1.956304907798767, "logits/rejected": -1.877297282218933, "logps/chosen": -52.95531463623047, "logps/rejected": -177.2889862060547, "loss": 2.6057, "rewards/accuracies": 0.0, "rewards/chosen": 2.115161895751953, "rewards/margins": -5.19348669052124, "rewards/rejected": 7.308648586273193, "step": 1314 }, { "epoch": 0.29, "learning_rate": 9.654586980319335e-06, "logits/chosen": -1.6865699291229248, "logits/rejected": -1.606508493423462, "logps/chosen": -104.31840515136719, "logps/rejected": -96.5484848022461, "loss": 2.2865, "rewards/accuracies": 0.0, "rewards/chosen": 4.4113664627075195, "rewards/margins": -3.898103713989258, "rewards/rejected": 8.309470176696777, "step": 1315 }, { "epoch": 0.29, "learning_rate": 9.653932069669112e-06, "logits/chosen": -1.1030806303024292, "logits/rejected": -1.1030806303024292, "logps/chosen": -22.731035232543945, "logps/rejected": -22.731035232543945, "loss": 0.9691, "rewards/accuracies": 0.0, "rewards/chosen": 1.3897322416305542, "rewards/margins": 0.0, "rewards/rejected": 1.3897322416305542, "step": 1316 }, { "epoch": 0.29, "learning_rate": 9.653276560999805e-06, "logits/chosen": -1.206763744354248, "logits/rejected": -1.1634314060211182, "logps/chosen": -114.166259765625, "logps/rejected": -146.02398681640625, "loss": 0.5918, "rewards/accuracies": 0.0, "rewards/chosen": 6.9022111892700195, "rewards/margins": -0.5778241157531738, "rewards/rejected": 7.480035305023193, "step": 1317 }, { "epoch": 0.29, "learning_rate": 9.652620454395647e-06, "logits/chosen": -1.4290025234222412, "logits/rejected": -1.3989529609680176, "logps/chosen": -66.62574005126953, "logps/rejected": -47.85844039916992, "loss": 0.2581, "rewards/accuracies": 1.0, "rewards/chosen": 2.7853448390960693, "rewards/margins": 0.4136228561401367, "rewards/rejected": 2.3717219829559326, "step": 1318 }, { "epoch": 0.29, "learning_rate": 9.651963749940944e-06, "logits/chosen": -1.3515597581863403, "logits/rejected": -1.2694989442825317, "logps/chosen": -60.95990753173828, "logps/rejected": -50.40786361694336, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 5.4642767906188965, "rewards/margins": 1.9255266189575195, "rewards/rejected": 3.538750171661377, "step": 1319 }, { "epoch": 0.29, "learning_rate": 9.651306447720083e-06, "logits/chosen": -1.5188266038894653, "logits/rejected": -1.2861641645431519, "logps/chosen": -131.7514190673828, "logps/rejected": -63.558319091796875, "loss": 0.3654, "rewards/accuracies": 1.0, "rewards/chosen": 5.417503356933594, "rewards/margins": 3.8054146766662598, "rewards/rejected": 1.6120887994766235, "step": 1320 }, { "epoch": 0.29, "learning_rate": 9.650648547817524e-06, "logits/chosen": -1.503592610359192, "logits/rejected": -1.4440513849258423, "logps/chosen": -30.17937660217285, "logps/rejected": -41.035888671875, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 2.891101598739624, "rewards/margins": 0.6393632888793945, "rewards/rejected": 2.2517383098602295, "step": 1321 }, { "epoch": 0.29, "learning_rate": 9.649990050317806e-06, "logits/chosen": -1.2817420959472656, "logits/rejected": -1.2117812633514404, "logps/chosen": -66.80885314941406, "logps/rejected": -46.229896545410156, "loss": 1.1417, "rewards/accuracies": 0.0, "rewards/chosen": 3.9958884716033936, "rewards/margins": -2.1741745471954346, "rewards/rejected": 6.170063018798828, "step": 1322 }, { "epoch": 0.29, "learning_rate": 9.649330955305547e-06, "logits/chosen": -1.4374736547470093, "logits/rejected": -1.3660705089569092, "logps/chosen": -188.0845947265625, "logps/rejected": -72.90217590332031, "loss": 1.3999, "rewards/accuracies": 0.0, "rewards/chosen": 4.436944484710693, "rewards/margins": -1.900364875793457, "rewards/rejected": 6.33730936050415, "step": 1323 }, { "epoch": 0.29, "learning_rate": 9.648671262865434e-06, "logits/chosen": -1.284613847732544, "logits/rejected": -1.2221250534057617, "logps/chosen": -101.4111099243164, "logps/rejected": -39.55840301513672, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 5.267577648162842, "rewards/margins": 1.7743425369262695, "rewards/rejected": 3.4932351112365723, "step": 1324 }, { "epoch": 0.29, "learning_rate": 9.648010973082243e-06, "logits/chosen": -1.327561855316162, "logits/rejected": -1.264832854270935, "logps/chosen": -84.78524780273438, "logps/rejected": -81.85539245605469, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 7.324059963226318, "rewards/margins": 4.879992485046387, "rewards/rejected": 2.4440674781799316, "step": 1325 }, { "epoch": 0.29, "learning_rate": 9.647350086040812e-06, "logits/chosen": -1.0598984956741333, "logits/rejected": -1.0598984956741333, "logps/chosen": -37.209007263183594, "logps/rejected": -37.209007263183594, "loss": 0.795, "rewards/accuracies": 0.0, "rewards/chosen": 4.008184909820557, "rewards/margins": 0.0, "rewards/rejected": 4.008184909820557, "step": 1326 }, { "epoch": 0.29, "learning_rate": 9.646688601826068e-06, "logits/chosen": -1.4606831073760986, "logits/rejected": -1.442149043083191, "logps/chosen": -69.25971984863281, "logps/rejected": -110.36337280273438, "loss": 0.6445, "rewards/accuracies": 0.0, "rewards/chosen": 2.501488447189331, "rewards/margins": -0.8095865249633789, "rewards/rejected": 3.31107497215271, "step": 1327 }, { "epoch": 0.29, "learning_rate": 9.646026520523008e-06, "logits/chosen": -1.1849883794784546, "logits/rejected": -1.158225178718567, "logps/chosen": -79.35603332519531, "logps/rejected": -86.47821807861328, "loss": 0.3036, "rewards/accuracies": 1.0, "rewards/chosen": 3.899115800857544, "rewards/margins": 1.375603437423706, "rewards/rejected": 2.523512363433838, "step": 1328 }, { "epoch": 0.29, "learning_rate": 9.64536384221671e-06, "logits/chosen": -1.6478965282440186, "logits/rejected": -1.7135485410690308, "logps/chosen": -48.66455078125, "logps/rejected": -112.54467010498047, "loss": 2.7447, "rewards/accuracies": 0.0, "rewards/chosen": 2.5046563148498535, "rewards/margins": -3.6500015258789062, "rewards/rejected": 6.15465784072876, "step": 1329 }, { "epoch": 0.29, "learning_rate": 9.644700566992324e-06, "logits/chosen": -1.1662124395370483, "logits/rejected": -1.1662124395370483, "logps/chosen": -88.29234313964844, "logps/rejected": -88.29234313964844, "loss": 0.3983, "rewards/accuracies": 0.0, "rewards/chosen": 2.9661712646484375, "rewards/margins": 0.0, "rewards/rejected": 2.9661712646484375, "step": 1330 }, { "epoch": 0.29, "learning_rate": 9.644036694935083e-06, "logits/chosen": -1.4076915979385376, "logits/rejected": -0.9624401330947876, "logps/chosen": -83.22221374511719, "logps/rejected": -200.62940979003906, "loss": 3.7922, "rewards/accuracies": 0.0, "rewards/chosen": 1.2421951293945312, "rewards/margins": -6.708456516265869, "rewards/rejected": 7.9506516456604, "step": 1331 }, { "epoch": 0.29, "learning_rate": 9.64337222613029e-06, "logits/chosen": -1.3088881969451904, "logits/rejected": -1.1709630489349365, "logps/chosen": -155.20587158203125, "logps/rejected": -163.30162048339844, "loss": 0.2617, "rewards/accuracies": 1.0, "rewards/chosen": 8.488588333129883, "rewards/margins": 0.37883758544921875, "rewards/rejected": 8.109750747680664, "step": 1332 }, { "epoch": 0.3, "learning_rate": 9.642707160663326e-06, "logits/chosen": -1.2250704765319824, "logits/rejected": -1.1406967639923096, "logps/chosen": -98.97676086425781, "logps/rejected": -87.18234252929688, "loss": 0.3484, "rewards/accuracies": 1.0, "rewards/chosen": 5.954713344573975, "rewards/margins": 1.5340285301208496, "rewards/rejected": 4.420684814453125, "step": 1333 }, { "epoch": 0.3, "learning_rate": 9.642041498619655e-06, "logits/chosen": -1.323715090751648, "logits/rejected": -1.0252004861831665, "logps/chosen": -142.8417205810547, "logps/rejected": -8.610623359680176, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 4.584089756011963, "rewards/margins": 3.655247211456299, "rewards/rejected": 0.9288426637649536, "step": 1334 }, { "epoch": 0.3, "learning_rate": 9.64137524008481e-06, "logits/chosen": -1.1863977909088135, "logits/rejected": -1.1259775161743164, "logps/chosen": -38.933746337890625, "logps/rejected": -34.37190246582031, "loss": 1.0089, "rewards/accuracies": 1.0, "rewards/chosen": 1.4619483947753906, "rewards/margins": 0.5357696413993835, "rewards/rejected": 0.9261787533760071, "step": 1335 }, { "epoch": 0.3, "learning_rate": 9.640708385144403e-06, "logits/chosen": -1.0766382217407227, "logits/rejected": -1.1713480949401855, "logps/chosen": -52.434112548828125, "logps/rejected": -78.15360260009766, "loss": 2.3258, "rewards/accuracies": 0.0, "rewards/chosen": 2.8881287574768066, "rewards/margins": -4.192861080169678, "rewards/rejected": 7.080989837646484, "step": 1336 }, { "epoch": 0.3, "learning_rate": 9.640040933884126e-06, "logits/chosen": -1.248274803161621, "logits/rejected": -1.240842580795288, "logps/chosen": -130.6715087890625, "logps/rejected": -81.89089965820312, "loss": 0.6247, "rewards/accuracies": 0.0, "rewards/chosen": 6.513885498046875, "rewards/margins": -0.888397216796875, "rewards/rejected": 7.40228271484375, "step": 1337 }, { "epoch": 0.3, "learning_rate": 9.639372886389743e-06, "logits/chosen": -1.5212382078170776, "logits/rejected": -1.4874911308288574, "logps/chosen": -102.30927276611328, "logps/rejected": -77.95685577392578, "loss": 0.6228, "rewards/accuracies": 1.0, "rewards/chosen": 5.090641021728516, "rewards/margins": 2.005566358566284, "rewards/rejected": 3.0850746631622314, "step": 1338 }, { "epoch": 0.3, "learning_rate": 9.638704242747097e-06, "logits/chosen": -1.7013555765151978, "logits/rejected": -1.5969351530075073, "logps/chosen": -159.63467407226562, "logps/rejected": -42.00595474243164, "loss": 0.144, "rewards/accuracies": 1.0, "rewards/chosen": 7.29518461227417, "rewards/margins": 2.992554187774658, "rewards/rejected": 4.302630424499512, "step": 1339 }, { "epoch": 0.3, "learning_rate": 9.638035003042108e-06, "logits/chosen": -1.391330599784851, "logits/rejected": -1.2159677743911743, "logps/chosen": -65.677490234375, "logps/rejected": -15.150430679321289, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 3.8038926124572754, "rewards/margins": 3.328859329223633, "rewards/rejected": 0.47503337264060974, "step": 1340 }, { "epoch": 0.3, "learning_rate": 9.637365167360769e-06, "logits/chosen": -1.0692085027694702, "logits/rejected": -1.1019107103347778, "logps/chosen": -45.99789047241211, "logps/rejected": -26.528709411621094, "loss": 0.2609, "rewards/accuracies": 1.0, "rewards/chosen": 1.6552006006240845, "rewards/margins": 0.3797413110733032, "rewards/rejected": 1.2754592895507812, "step": 1341 }, { "epoch": 0.3, "learning_rate": 9.636694735789153e-06, "logits/chosen": -1.7323790788650513, "logits/rejected": -1.7340890169143677, "logps/chosen": -60.74335861206055, "logps/rejected": -64.53217315673828, "loss": 0.361, "rewards/accuracies": 0.0, "rewards/chosen": 3.0512378215789795, "rewards/margins": -0.006138324737548828, "rewards/rejected": 3.0573761463165283, "step": 1342 }, { "epoch": 0.3, "learning_rate": 9.636023708413412e-06, "logits/chosen": -1.3875194787979126, "logits/rejected": -1.3875194787979126, "logps/chosen": -88.90018463134766, "logps/rejected": -88.90018463134766, "loss": 2.2629, "rewards/accuracies": 0.0, "rewards/chosen": 5.895848751068115, "rewards/margins": 0.0, "rewards/rejected": 5.895848751068115, "step": 1343 }, { "epoch": 0.3, "learning_rate": 9.635352085319768e-06, "logits/chosen": -1.3158667087554932, "logits/rejected": -1.2427042722702026, "logps/chosen": -50.97038269042969, "logps/rejected": -49.325843811035156, "loss": 0.1273, "rewards/accuracies": 1.0, "rewards/chosen": 3.8872947692871094, "rewards/margins": 1.268714189529419, "rewards/rejected": 2.6185805797576904, "step": 1344 }, { "epoch": 0.3, "learning_rate": 9.634679866594525e-06, "logits/chosen": -1.2632936239242554, "logits/rejected": -1.2147530317306519, "logps/chosen": -70.51071166992188, "logps/rejected": -22.587486267089844, "loss": 0.5799, "rewards/accuracies": 0.0, "rewards/chosen": 1.3114814758300781, "rewards/margins": -0.7111766338348389, "rewards/rejected": 2.022658109664917, "step": 1345 }, { "epoch": 0.3, "learning_rate": 9.63400705232406e-06, "logits/chosen": -1.575573205947876, "logits/rejected": -1.5519568920135498, "logps/chosen": -52.16810989379883, "logps/rejected": -48.63367462158203, "loss": 0.9457, "rewards/accuracies": 0.0, "rewards/chosen": 1.8592678308486938, "rewards/margins": -1.1108044385910034, "rewards/rejected": 2.9700722694396973, "step": 1346 }, { "epoch": 0.3, "learning_rate": 9.633333642594828e-06, "logits/chosen": -1.1468278169631958, "logits/rejected": -1.0429027080535889, "logps/chosen": -60.418766021728516, "logps/rejected": -30.749595642089844, "loss": 1.7015, "rewards/accuracies": 0.0, "rewards/chosen": 2.0896968841552734, "rewards/margins": -0.6194970607757568, "rewards/rejected": 2.7091939449310303, "step": 1347 }, { "epoch": 0.3, "learning_rate": 9.632659637493362e-06, "logits/chosen": -1.3334155082702637, "logits/rejected": -1.24174165725708, "logps/chosen": -97.06766510009766, "logps/rejected": -96.7738037109375, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 2.6515495777130127, "rewards/margins": 2.35493540763855, "rewards/rejected": 0.2966140806674957, "step": 1348 }, { "epoch": 0.3, "learning_rate": 9.631985037106268e-06, "logits/chosen": -1.3164058923721313, "logits/rejected": -1.2598423957824707, "logps/chosen": -66.18217468261719, "logps/rejected": -18.080059051513672, "loss": 0.1854, "rewards/accuracies": 1.0, "rewards/chosen": 2.3658478260040283, "rewards/margins": 1.8470760583877563, "rewards/rejected": 0.518771767616272, "step": 1349 }, { "epoch": 0.3, "learning_rate": 9.631309841520233e-06, "logits/chosen": -1.2367006540298462, "logits/rejected": -1.260560154914856, "logps/chosen": -108.43063354492188, "logps/rejected": -48.473419189453125, "loss": 0.9655, "rewards/accuracies": 0.0, "rewards/chosen": 2.8784759044647217, "rewards/margins": -0.0710153579711914, "rewards/rejected": 2.949491262435913, "step": 1350 }, { "epoch": 0.3, "learning_rate": 9.630634050822016e-06, "logits/chosen": -1.3916503190994263, "logits/rejected": -1.5053510665893555, "logps/chosen": -86.32071685791016, "logps/rejected": -91.62037658691406, "loss": 0.4924, "rewards/accuracies": 0.0, "rewards/chosen": 3.6685616970062256, "rewards/margins": -0.49956727027893066, "rewards/rejected": 4.168128967285156, "step": 1351 }, { "epoch": 0.3, "learning_rate": 9.629957665098458e-06, "logits/chosen": -1.244876503944397, "logits/rejected": -1.2366578578948975, "logps/chosen": -48.918212890625, "logps/rejected": -45.669620513916016, "loss": 1.3991, "rewards/accuracies": 0.0, "rewards/chosen": 2.571057081222534, "rewards/margins": -2.7037909030914307, "rewards/rejected": 5.274847984313965, "step": 1352 }, { "epoch": 0.3, "learning_rate": 9.629280684436467e-06, "logits/chosen": -1.3545958995819092, "logits/rejected": -1.3069684505462646, "logps/chosen": -110.94264221191406, "logps/rejected": -62.54841613769531, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": 6.713120937347412, "rewards/margins": 2.324375629425049, "rewards/rejected": 4.388745307922363, "step": 1353 }, { "epoch": 0.3, "learning_rate": 9.628603108923037e-06, "logits/chosen": -1.2903300523757935, "logits/rejected": -1.2910447120666504, "logps/chosen": -63.795616149902344, "logps/rejected": -52.38967514038086, "loss": 1.4059, "rewards/accuracies": 1.0, "rewards/chosen": 2.7340247631073, "rewards/margins": 0.11507058143615723, "rewards/rejected": 2.6189541816711426, "step": 1354 }, { "epoch": 0.3, "learning_rate": 9.627924938645234e-06, "logits/chosen": -1.28763747215271, "logits/rejected": -1.3244317770004272, "logps/chosen": -59.58920669555664, "logps/rejected": -53.39934539794922, "loss": 1.3049, "rewards/accuracies": 0.0, "rewards/chosen": 2.0449702739715576, "rewards/margins": -1.147390365600586, "rewards/rejected": 3.1923606395721436, "step": 1355 }, { "epoch": 0.3, "learning_rate": 9.627246173690202e-06, "logits/chosen": -1.3727846145629883, "logits/rejected": -1.3035855293273926, "logps/chosen": -42.18757247924805, "logps/rejected": -61.95652770996094, "loss": 0.4065, "rewards/accuracies": 1.0, "rewards/chosen": 2.2760250568389893, "rewards/margins": 0.14663195610046387, "rewards/rejected": 2.1293931007385254, "step": 1356 }, { "epoch": 0.3, "learning_rate": 9.62656681414516e-06, "logits/chosen": -1.1591997146606445, "logits/rejected": -1.297667145729065, "logps/chosen": -37.37228775024414, "logps/rejected": -123.90495300292969, "loss": 2.6467, "rewards/accuracies": 0.0, "rewards/chosen": 2.156018018722534, "rewards/margins": -5.278508186340332, "rewards/rejected": 7.434525966644287, "step": 1357 }, { "epoch": 0.3, "learning_rate": 9.625886860097406e-06, "logits/chosen": -1.3756732940673828, "logits/rejected": -1.3832316398620605, "logps/chosen": -90.9840316772461, "logps/rejected": -142.87936401367188, "loss": 0.915, "rewards/accuracies": 0.0, "rewards/chosen": 4.916907787322998, "rewards/margins": -0.8348731994628906, "rewards/rejected": 5.751780986785889, "step": 1358 }, { "epoch": 0.3, "learning_rate": 9.62520631163431e-06, "logits/chosen": -1.5014894008636475, "logits/rejected": -1.5021942853927612, "logps/chosen": -51.885772705078125, "logps/rejected": -62.778568267822266, "loss": 0.2422, "rewards/accuracies": 1.0, "rewards/chosen": 2.7619247436523438, "rewards/margins": 0.4881458282470703, "rewards/rejected": 2.2737789154052734, "step": 1359 }, { "epoch": 0.3, "learning_rate": 9.62452516884332e-06, "logits/chosen": -1.4874743223190308, "logits/rejected": -1.4490375518798828, "logps/chosen": -37.02170944213867, "logps/rejected": -141.02142333984375, "loss": 3.4071, "rewards/accuracies": 0.0, "rewards/chosen": 2.111832857131958, "rewards/margins": -6.610434532165527, "rewards/rejected": 8.722267150878906, "step": 1360 }, { "epoch": 0.3, "learning_rate": 9.623843431811964e-06, "logits/chosen": -1.7571731805801392, "logits/rejected": -1.6414766311645508, "logps/chosen": -74.914306640625, "logps/rejected": -80.45097351074219, "loss": 0.6076, "rewards/accuracies": 1.0, "rewards/chosen": 6.615960597991943, "rewards/margins": 2.5480756759643555, "rewards/rejected": 4.067884922027588, "step": 1361 }, { "epoch": 0.3, "learning_rate": 9.623161100627842e-06, "logits/chosen": -1.3746436834335327, "logits/rejected": -1.3746436834335327, "logps/chosen": -29.02143096923828, "logps/rejected": -29.02143096923828, "loss": 0.6426, "rewards/accuracies": 0.0, "rewards/chosen": 1.961402177810669, "rewards/margins": 0.0, "rewards/rejected": 1.961402177810669, "step": 1362 }, { "epoch": 0.3, "learning_rate": 9.622478175378634e-06, "logits/chosen": -1.4486119747161865, "logits/rejected": -1.453582525253296, "logps/chosen": -160.21713256835938, "logps/rejected": -116.13814544677734, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 7.069324016571045, "rewards/margins": 1.334902286529541, "rewards/rejected": 5.734421730041504, "step": 1363 }, { "epoch": 0.3, "learning_rate": 9.62179465615209e-06, "logits/chosen": -1.5735639333724976, "logits/rejected": -1.5887246131896973, "logps/chosen": -77.07833862304688, "logps/rejected": -161.33258056640625, "loss": 2.8984, "rewards/accuracies": 0.0, "rewards/chosen": 6.378404140472412, "rewards/margins": -4.397847652435303, "rewards/rejected": 10.776251792907715, "step": 1364 }, { "epoch": 0.3, "learning_rate": 9.621110543036047e-06, "logits/chosen": -1.1627691984176636, "logits/rejected": -1.0265361070632935, "logps/chosen": -63.07516860961914, "logps/rejected": -28.342037200927734, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": 3.542510747909546, "rewards/margins": 1.997612476348877, "rewards/rejected": 1.544898271560669, "step": 1365 }, { "epoch": 0.3, "learning_rate": 9.620425836118406e-06, "logits/chosen": -1.4961575269699097, "logits/rejected": -1.5008488893508911, "logps/chosen": -134.73892211914062, "logps/rejected": -112.62079620361328, "loss": 0.3199, "rewards/accuracies": 1.0, "rewards/chosen": 7.022282600402832, "rewards/margins": 0.2625923156738281, "rewards/rejected": 6.759690284729004, "step": 1366 }, { "epoch": 0.3, "learning_rate": 9.619740535487151e-06, "logits/chosen": -1.449528694152832, "logits/rejected": -1.3761003017425537, "logps/chosen": -102.50169372558594, "logps/rejected": -77.27076721191406, "loss": 1.0365, "rewards/accuracies": 1.0, "rewards/chosen": 5.065392971038818, "rewards/margins": 3.1395111083984375, "rewards/rejected": 1.9258819818496704, "step": 1367 }, { "epoch": 0.3, "learning_rate": 9.619054641230343e-06, "logits/chosen": -1.3906266689300537, "logits/rejected": -1.217869520187378, "logps/chosen": -132.17025756835938, "logps/rejected": -37.870235443115234, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 8.180160522460938, "rewards/margins": 5.472015380859375, "rewards/rejected": 2.7081449031829834, "step": 1368 }, { "epoch": 0.3, "learning_rate": 9.618368153436119e-06, "logits/chosen": -1.2119886875152588, "logits/rejected": -1.0945488214492798, "logps/chosen": -124.74312591552734, "logps/rejected": -51.06031799316406, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 4.881656646728516, "rewards/margins": 2.296044111251831, "rewards/rejected": 2.5856125354766846, "step": 1369 }, { "epoch": 0.3, "learning_rate": 9.617681072192688e-06, "logits/chosen": -1.348855972290039, "logits/rejected": -1.2436856031417847, "logps/chosen": -51.162330627441406, "logps/rejected": -23.817041397094727, "loss": 1.2776, "rewards/accuracies": 1.0, "rewards/chosen": 5.098135471343994, "rewards/margins": 4.252007961273193, "rewards/rejected": 0.8461275100708008, "step": 1370 }, { "epoch": 0.3, "learning_rate": 9.616993397588342e-06, "logits/chosen": -1.3704261779785156, "logits/rejected": -1.2467586994171143, "logps/chosen": -58.24005126953125, "logps/rejected": -34.954917907714844, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 3.114737033843994, "rewards/margins": 2.9662346839904785, "rewards/rejected": 0.14850234985351562, "step": 1371 }, { "epoch": 0.3, "learning_rate": 9.61630512971144e-06, "logits/chosen": -1.3532307147979736, "logits/rejected": -1.3532307147979736, "logps/chosen": -47.52323532104492, "logps/rejected": -47.52323532104492, "loss": 0.602, "rewards/accuracies": 0.0, "rewards/chosen": 2.9237430095672607, "rewards/margins": 0.0, "rewards/rejected": 2.9237430095672607, "step": 1372 }, { "epoch": 0.3, "learning_rate": 9.61561626865043e-06, "logits/chosen": -1.1276978254318237, "logits/rejected": -1.157027006149292, "logps/chosen": -91.45751953125, "logps/rejected": -90.51573181152344, "loss": 0.2025, "rewards/accuracies": 1.0, "rewards/chosen": 4.827609539031982, "rewards/margins": 0.7123279571533203, "rewards/rejected": 4.115281581878662, "step": 1373 }, { "epoch": 0.3, "learning_rate": 9.614926814493822e-06, "logits/chosen": -0.9063987731933594, "logits/rejected": -0.8379043340682983, "logps/chosen": -47.87746047973633, "logps/rejected": -38.815460205078125, "loss": 0.3997, "rewards/accuracies": 1.0, "rewards/chosen": 4.232546806335449, "rewards/margins": 1.3417630195617676, "rewards/rejected": 2.8907837867736816, "step": 1374 }, { "epoch": 0.3, "learning_rate": 9.614236767330214e-06, "logits/chosen": -1.1320950984954834, "logits/rejected": -0.996863603591919, "logps/chosen": -142.85409545898438, "logps/rejected": -66.02229309082031, "loss": 0.1646, "rewards/accuracies": 1.0, "rewards/chosen": 5.399655342102051, "rewards/margins": 1.1712737083435059, "rewards/rejected": 4.228381633758545, "step": 1375 }, { "epoch": 0.3, "learning_rate": 9.613546127248272e-06, "logits/chosen": -1.3119350671768188, "logits/rejected": -1.3324346542358398, "logps/chosen": -51.87706756591797, "logps/rejected": -39.3541259765625, "loss": 0.6589, "rewards/accuracies": 1.0, "rewards/chosen": 2.745265245437622, "rewards/margins": 0.3566443920135498, "rewards/rejected": 2.3886208534240723, "step": 1376 }, { "epoch": 0.3, "learning_rate": 9.612854894336746e-06, "logits/chosen": -1.6054991483688354, "logits/rejected": -1.5401169061660767, "logps/chosen": -118.53646850585938, "logps/rejected": -91.2969970703125, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 7.842193603515625, "rewards/margins": 2.7639999389648438, "rewards/rejected": 5.078193664550781, "step": 1377 }, { "epoch": 0.31, "learning_rate": 9.612163068684453e-06, "logits/chosen": -1.3421982526779175, "logits/rejected": -1.1407419443130493, "logps/chosen": -87.94108581542969, "logps/rejected": -8.65684986114502, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 7.132228374481201, "rewards/margins": 6.251741886138916, "rewards/rejected": 0.8804866075515747, "step": 1378 }, { "epoch": 0.31, "learning_rate": 9.611470650380293e-06, "logits/chosen": -1.7721023559570312, "logits/rejected": -1.6669105291366577, "logps/chosen": -67.53341674804688, "logps/rejected": -33.8900032043457, "loss": 0.1256, "rewards/accuracies": 1.0, "rewards/chosen": 3.9745118618011475, "rewards/margins": 1.2791357040405273, "rewards/rejected": 2.69537615776062, "step": 1379 }, { "epoch": 0.31, "learning_rate": 9.61077763951324e-06, "logits/chosen": -1.1604889631271362, "logits/rejected": -1.1580859422683716, "logps/chosen": -15.066787719726562, "logps/rejected": -2.6447548866271973, "loss": 0.2313, "rewards/accuracies": 1.0, "rewards/chosen": 1.4811756610870361, "rewards/margins": 1.0862287282943726, "rewards/rejected": 0.3949469029903412, "step": 1380 }, { "epoch": 0.31, "learning_rate": 9.610084036172346e-06, "logits/chosen": -1.1386983394622803, "logits/rejected": -1.1234171390533447, "logps/chosen": -43.157196044921875, "logps/rejected": -49.39946746826172, "loss": 0.6733, "rewards/accuracies": 1.0, "rewards/chosen": 1.214434027671814, "rewards/margins": 0.08084440231323242, "rewards/rejected": 1.1335896253585815, "step": 1381 }, { "epoch": 0.31, "learning_rate": 9.609389840446734e-06, "logits/chosen": -1.2401090860366821, "logits/rejected": -1.1837211847305298, "logps/chosen": -28.679855346679688, "logps/rejected": -7.367412567138672, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 3.331683397293091, "rewards/margins": 2.282627582550049, "rewards/rejected": 1.0490559339523315, "step": 1382 }, { "epoch": 0.31, "learning_rate": 9.60869505242561e-06, "logits/chosen": -1.4180481433868408, "logits/rejected": -1.3164045810699463, "logps/chosen": -46.08269119262695, "logps/rejected": -28.25409698486328, "loss": 2.0957, "rewards/accuracies": 1.0, "rewards/chosen": 4.292527675628662, "rewards/margins": 2.4058852195739746, "rewards/rejected": 1.8866424560546875, "step": 1383 }, { "epoch": 0.31, "learning_rate": 9.60799967219825e-06, "logits/chosen": -1.2497093677520752, "logits/rejected": -1.2952368259429932, "logps/chosen": -41.104061126708984, "logps/rejected": -90.69328308105469, "loss": 0.9953, "rewards/accuracies": 0.0, "rewards/chosen": 2.2620739936828613, "rewards/margins": -0.5984210968017578, "rewards/rejected": 2.860495090484619, "step": 1384 }, { "epoch": 0.31, "learning_rate": 9.607303699854009e-06, "logits/chosen": -1.2652239799499512, "logits/rejected": -1.289299726486206, "logps/chosen": -50.94477462768555, "logps/rejected": -56.11713790893555, "loss": 2.0435, "rewards/accuracies": 0.0, "rewards/chosen": 2.1355855464935303, "rewards/margins": -3.4253151416778564, "rewards/rejected": 5.560900688171387, "step": 1385 }, { "epoch": 0.31, "learning_rate": 9.606607135482318e-06, "logits/chosen": -1.0920206308364868, "logits/rejected": -1.049616813659668, "logps/chosen": -38.49444580078125, "logps/rejected": -56.76578140258789, "loss": 1.2623, "rewards/accuracies": 1.0, "rewards/chosen": 2.2810311317443848, "rewards/margins": 0.5660068988800049, "rewards/rejected": 1.7150242328643799, "step": 1386 }, { "epoch": 0.31, "learning_rate": 9.605909979172683e-06, "logits/chosen": -1.490286111831665, "logits/rejected": -1.1977758407592773, "logps/chosen": -157.0223846435547, "logps/rejected": -58.63738250732422, "loss": 1.7395, "rewards/accuracies": 1.0, "rewards/chosen": 6.552731513977051, "rewards/margins": 5.005401134490967, "rewards/rejected": 1.5473304986953735, "step": 1387 }, { "epoch": 0.31, "learning_rate": 9.60521223101469e-06, "logits/chosen": -1.6567021608352661, "logits/rejected": -1.5688384771347046, "logps/chosen": -109.56745147705078, "logps/rejected": -86.30671691894531, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": 5.674825191497803, "rewards/margins": 3.352226972579956, "rewards/rejected": 2.3225982189178467, "step": 1388 }, { "epoch": 0.31, "learning_rate": 9.604513891097995e-06, "logits/chosen": -1.7651914358139038, "logits/rejected": -1.7669906616210938, "logps/chosen": -46.84038543701172, "logps/rejected": -69.18832397460938, "loss": 0.6661, "rewards/accuracies": 0.0, "rewards/chosen": 3.572563886642456, "rewards/margins": -0.4413735866546631, "rewards/rejected": 4.013937473297119, "step": 1389 }, { "epoch": 0.31, "learning_rate": 9.603814959512334e-06, "logits/chosen": -1.0620110034942627, "logits/rejected": -1.0620110034942627, "logps/chosen": -71.73355102539062, "logps/rejected": -71.73355102539062, "loss": 0.4124, "rewards/accuracies": 0.0, "rewards/chosen": 3.920867919921875, "rewards/margins": 0.0, "rewards/rejected": 3.920867919921875, "step": 1390 }, { "epoch": 0.31, "learning_rate": 9.603115436347519e-06, "logits/chosen": -1.555616855621338, "logits/rejected": -1.5642231702804565, "logps/chosen": -127.72819519042969, "logps/rejected": -175.3825225830078, "loss": 0.9674, "rewards/accuracies": 0.0, "rewards/chosen": 7.105171203613281, "rewards/margins": -1.2047033309936523, "rewards/rejected": 8.309874534606934, "step": 1391 }, { "epoch": 0.31, "learning_rate": 9.602415321693434e-06, "logits/chosen": -1.114081859588623, "logits/rejected": -1.032994031906128, "logps/chosen": -59.4300537109375, "logps/rejected": -35.72490692138672, "loss": 0.5221, "rewards/accuracies": 1.0, "rewards/chosen": 3.0801339149475098, "rewards/margins": 1.0055663585662842, "rewards/rejected": 2.0745675563812256, "step": 1392 }, { "epoch": 0.31, "learning_rate": 9.601714615640046e-06, "logits/chosen": -1.122159719467163, "logits/rejected": -1.1243420839309692, "logps/chosen": -64.3179931640625, "logps/rejected": -73.36463165283203, "loss": 1.1732, "rewards/accuracies": 0.0, "rewards/chosen": 2.6506240367889404, "rewards/margins": -2.2256195545196533, "rewards/rejected": 4.876243591308594, "step": 1393 }, { "epoch": 0.31, "learning_rate": 9.601013318277391e-06, "logits/chosen": -1.173278570175171, "logits/rejected": -1.0917390584945679, "logps/chosen": -59.42448425292969, "logps/rejected": -48.44667053222656, "loss": 0.8777, "rewards/accuracies": 0.0, "rewards/chosen": 1.6028121709823608, "rewards/margins": -1.0144363641738892, "rewards/rejected": 2.61724853515625, "step": 1394 }, { "epoch": 0.31, "learning_rate": 9.600311429695586e-06, "logits/chosen": -1.8555964231491089, "logits/rejected": -1.767562985420227, "logps/chosen": -94.24990844726562, "logps/rejected": -47.27307891845703, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 3.9214112758636475, "rewards/margins": 1.859980821609497, "rewards/rejected": 2.0614304542541504, "step": 1395 }, { "epoch": 0.31, "learning_rate": 9.59960894998482e-06, "logits/chosen": -1.0836750268936157, "logits/rejected": -0.5530695915222168, "logps/chosen": -31.799402236938477, "logps/rejected": -35.969520568847656, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 2.224057197570801, "rewards/margins": 1.8166368007659912, "rewards/rejected": 0.4074203670024872, "step": 1396 }, { "epoch": 0.31, "learning_rate": 9.598905879235362e-06, "logits/chosen": -1.60420823097229, "logits/rejected": -1.60494065284729, "logps/chosen": -158.37142944335938, "logps/rejected": -152.02224731445312, "loss": 0.1707, "rewards/accuracies": 1.0, "rewards/chosen": 8.40937328338623, "rewards/margins": 0.9728512763977051, "rewards/rejected": 7.436522006988525, "step": 1397 }, { "epoch": 0.31, "learning_rate": 9.598202217537554e-06, "logits/chosen": -1.247291088104248, "logits/rejected": -1.247291088104248, "logps/chosen": -36.11764144897461, "logps/rejected": -36.11764144897461, "loss": 0.9083, "rewards/accuracies": 0.0, "rewards/chosen": 2.0392510890960693, "rewards/margins": 0.0, "rewards/rejected": 2.0392510890960693, "step": 1398 }, { "epoch": 0.31, "learning_rate": 9.597497964981815e-06, "logits/chosen": -1.3323051929473877, "logits/rejected": -1.3432306051254272, "logps/chosen": -2.5824508666992188, "logps/rejected": -13.081103324890137, "loss": 0.2857, "rewards/accuracies": 1.0, "rewards/chosen": 0.48633822798728943, "rewards/margins": 0.280890554189682, "rewards/rejected": 0.20544767379760742, "step": 1399 }, { "epoch": 0.31, "learning_rate": 9.59679312165864e-06, "logits/chosen": -1.1612358093261719, "logits/rejected": -1.139701247215271, "logps/chosen": -67.24708557128906, "logps/rejected": -64.38628387451172, "loss": 0.4574, "rewards/accuracies": 0.0, "rewards/chosen": 2.2079155445098877, "rewards/margins": -0.4021332263946533, "rewards/rejected": 2.610048770904541, "step": 1400 }, { "epoch": 0.31, "learning_rate": 9.596087687658598e-06, "logits/chosen": -1.303189754486084, "logits/rejected": -1.2677582502365112, "logps/chosen": -43.965309143066406, "logps/rejected": -81.37269592285156, "loss": 1.1049, "rewards/accuracies": 0.0, "rewards/chosen": 2.3380966186523438, "rewards/margins": -1.7336640357971191, "rewards/rejected": 4.071760654449463, "step": 1401 }, { "epoch": 0.31, "learning_rate": 9.595381663072335e-06, "logits/chosen": -1.4003193378448486, "logits/rejected": -1.2056355476379395, "logps/chosen": -126.7790298461914, "logps/rejected": -45.86787796020508, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 8.45291805267334, "rewards/margins": 6.574712753295898, "rewards/rejected": 1.8782055377960205, "step": 1402 }, { "epoch": 0.31, "learning_rate": 9.594675047990578e-06, "logits/chosen": -1.21940016746521, "logits/rejected": -1.1836345195770264, "logps/chosen": -91.58555603027344, "logps/rejected": -73.40003204345703, "loss": 0.2016, "rewards/accuracies": 1.0, "rewards/chosen": 5.434933662414551, "rewards/margins": 3.3912689685821533, "rewards/rejected": 2.0436646938323975, "step": 1403 }, { "epoch": 0.31, "learning_rate": 9.593967842504121e-06, "logits/chosen": -1.5386531352996826, "logits/rejected": -1.4991073608398438, "logps/chosen": -40.98001480102539, "logps/rejected": -79.33590698242188, "loss": 0.8546, "rewards/accuracies": 0.0, "rewards/chosen": 3.1791133880615234, "rewards/margins": -1.4685659408569336, "rewards/rejected": 4.647679328918457, "step": 1404 }, { "epoch": 0.31, "learning_rate": 9.593260046703842e-06, "logits/chosen": -1.3585423231124878, "logits/rejected": -1.3585423231124878, "logps/chosen": -53.893470764160156, "logps/rejected": -53.893470764160156, "loss": 0.8002, "rewards/accuracies": 0.0, "rewards/chosen": 3.6669692993164062, "rewards/margins": 0.0, "rewards/rejected": 3.6669692993164062, "step": 1405 }, { "epoch": 0.31, "learning_rate": 9.592551660680687e-06, "logits/chosen": -1.4646704196929932, "logits/rejected": -1.440615177154541, "logps/chosen": -102.01786804199219, "logps/rejected": -117.83411407470703, "loss": 1.4276, "rewards/accuracies": 0.0, "rewards/chosen": 4.7251176834106445, "rewards/margins": -2.7950706481933594, "rewards/rejected": 7.520188331604004, "step": 1406 }, { "epoch": 0.31, "learning_rate": 9.591842684525685e-06, "logits/chosen": -1.537999153137207, "logits/rejected": -1.3606300354003906, "logps/chosen": -144.77847290039062, "logps/rejected": -43.42881774902344, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 6.858325481414795, "rewards/margins": 3.628638744354248, "rewards/rejected": 3.229686737060547, "step": 1407 }, { "epoch": 0.31, "learning_rate": 9.591133118329936e-06, "logits/chosen": -1.3605107069015503, "logits/rejected": -1.3777015209197998, "logps/chosen": -81.8349380493164, "logps/rejected": -49.56148147583008, "loss": 0.3213, "rewards/accuracies": 1.0, "rewards/chosen": 3.471243381500244, "rewards/margins": 0.1853787899017334, "rewards/rejected": 3.2858645915985107, "step": 1408 }, { "epoch": 0.31, "learning_rate": 9.590422962184619e-06, "logits/chosen": -1.257797122001648, "logits/rejected": -1.2256168127059937, "logps/chosen": -31.660680770874023, "logps/rejected": -35.87250518798828, "loss": 0.7049, "rewards/accuracies": 0.0, "rewards/chosen": 1.8438745737075806, "rewards/margins": -0.046503305435180664, "rewards/rejected": 1.8903778791427612, "step": 1409 }, { "epoch": 0.31, "learning_rate": 9.589712216180986e-06, "logits/chosen": -1.5559210777282715, "logits/rejected": -1.4750745296478271, "logps/chosen": -175.18417358398438, "logps/rejected": -44.32865905761719, "loss": 0.1258, "rewards/accuracies": 1.0, "rewards/chosen": 4.822020053863525, "rewards/margins": 1.6584465503692627, "rewards/rejected": 3.1635735034942627, "step": 1410 }, { "epoch": 0.31, "learning_rate": 9.589000880410366e-06, "logits/chosen": -1.5981982946395874, "logits/rejected": -1.5981982946395874, "logps/chosen": -44.605201721191406, "logps/rejected": -44.605201721191406, "loss": 0.3719, "rewards/accuracies": 0.0, "rewards/chosen": 3.096202850341797, "rewards/margins": 0.0, "rewards/rejected": 3.096202850341797, "step": 1411 }, { "epoch": 0.31, "learning_rate": 9.588288954964164e-06, "logits/chosen": -1.1308012008666992, "logits/rejected": -1.1459922790527344, "logps/chosen": -57.542816162109375, "logps/rejected": -101.79776000976562, "loss": 0.8674, "rewards/accuracies": 0.0, "rewards/chosen": 3.339881181716919, "rewards/margins": -1.1235358715057373, "rewards/rejected": 4.463417053222656, "step": 1412 }, { "epoch": 0.31, "learning_rate": 9.587576439933862e-06, "logits/chosen": -1.3869222402572632, "logits/rejected": -1.3869222402572632, "logps/chosen": -52.61506652832031, "logps/rejected": -52.61506652832031, "loss": 0.7112, "rewards/accuracies": 0.0, "rewards/chosen": 2.9880752563476562, "rewards/margins": 0.0, "rewards/rejected": 2.9880752563476562, "step": 1413 }, { "epoch": 0.31, "learning_rate": 9.586863335411017e-06, "logits/chosen": -1.3879852294921875, "logits/rejected": -1.3879852294921875, "logps/chosen": -51.36045837402344, "logps/rejected": -51.36045837402344, "loss": 1.2643, "rewards/accuracies": 0.0, "rewards/chosen": 3.7315011024475098, "rewards/margins": 0.0, "rewards/rejected": 3.7315011024475098, "step": 1414 }, { "epoch": 0.31, "learning_rate": 9.586149641487257e-06, "logits/chosen": -1.2148466110229492, "logits/rejected": -1.1024686098098755, "logps/chosen": -61.746463775634766, "logps/rejected": -19.448686599731445, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 1.826029658317566, "rewards/margins": 1.058980941772461, "rewards/rejected": 0.7670486569404602, "step": 1415 }, { "epoch": 0.31, "learning_rate": 9.585435358254295e-06, "logits/chosen": -1.1121835708618164, "logits/rejected": -1.1901390552520752, "logps/chosen": -36.274681091308594, "logps/rejected": -89.42546081542969, "loss": 0.8387, "rewards/accuracies": 0.0, "rewards/chosen": 1.9653621912002563, "rewards/margins": -0.6724659204483032, "rewards/rejected": 2.6378281116485596, "step": 1416 }, { "epoch": 0.31, "learning_rate": 9.584720485803912e-06, "logits/chosen": -1.0305485725402832, "logits/rejected": -1.0132272243499756, "logps/chosen": -68.92790222167969, "logps/rejected": -73.2587890625, "loss": 1.7072, "rewards/accuracies": 0.0, "rewards/chosen": 2.216745138168335, "rewards/margins": -1.4505987167358398, "rewards/rejected": 3.667343854904175, "step": 1417 }, { "epoch": 0.31, "learning_rate": 9.584005024227967e-06, "logits/chosen": -1.4187638759613037, "logits/rejected": -1.3043826818466187, "logps/chosen": -79.53428649902344, "logps/rejected": -77.8809814453125, "loss": 1.2518, "rewards/accuracies": 0.0, "rewards/chosen": 3.609079122543335, "rewards/margins": -0.4488251209259033, "rewards/rejected": 4.057904243469238, "step": 1418 }, { "epoch": 0.31, "learning_rate": 9.583288973618398e-06, "logits/chosen": -1.254454255104065, "logits/rejected": -0.9596940279006958, "logps/chosen": -76.9136962890625, "logps/rejected": -27.730743408203125, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 3.984201192855835, "rewards/margins": 3.1369450092315674, "rewards/rejected": 0.8472561240196228, "step": 1419 }, { "epoch": 0.31, "learning_rate": 9.582572334067213e-06, "logits/chosen": -1.5972322225570679, "logits/rejected": -1.5237922668457031, "logps/chosen": -97.27049255371094, "logps/rejected": -128.91241455078125, "loss": 0.8306, "rewards/accuracies": 1.0, "rewards/chosen": 6.622406005859375, "rewards/margins": 1.482405185699463, "rewards/rejected": 5.140000820159912, "step": 1420 }, { "epoch": 0.31, "learning_rate": 9.581855105666497e-06, "logits/chosen": -1.4629560708999634, "logits/rejected": -1.3832629919052124, "logps/chosen": -75.16828918457031, "logps/rejected": -93.52146911621094, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 4.916715145111084, "rewards/margins": 2.7188284397125244, "rewards/rejected": 2.1978867053985596, "step": 1421 }, { "epoch": 0.31, "learning_rate": 9.581137288508417e-06, "logits/chosen": -1.474963665008545, "logits/rejected": -1.5163781642913818, "logps/chosen": -123.15028381347656, "logps/rejected": -135.2652587890625, "loss": 1.2672, "rewards/accuracies": 0.0, "rewards/chosen": 4.194848537445068, "rewards/margins": -2.3832459449768066, "rewards/rejected": 6.578094482421875, "step": 1422 }, { "epoch": 0.31, "learning_rate": 9.580418882685208e-06, "logits/chosen": -1.5761189460754395, "logits/rejected": -1.500693440437317, "logps/chosen": -58.355674743652344, "logps/rejected": -82.02381896972656, "loss": 0.3606, "rewards/accuracies": 1.0, "rewards/chosen": 2.8460617065429688, "rewards/margins": 0.7820174694061279, "rewards/rejected": 2.064044237136841, "step": 1423 }, { "epoch": 0.32, "learning_rate": 9.579699888289184e-06, "logits/chosen": -1.3813879489898682, "logits/rejected": -1.27432119846344, "logps/chosen": -61.23395538330078, "logps/rejected": -42.962867736816406, "loss": 0.195, "rewards/accuracies": 1.0, "rewards/chosen": 1.8406585454940796, "rewards/margins": 0.7655799388885498, "rewards/rejected": 1.0750786066055298, "step": 1424 }, { "epoch": 0.32, "learning_rate": 9.578980305412733e-06, "logits/chosen": -1.458748459815979, "logits/rejected": -1.3731290102005005, "logps/chosen": -76.43506622314453, "logps/rejected": -51.82766342163086, "loss": 0.5064, "rewards/accuracies": 0.0, "rewards/chosen": 2.427478790283203, "rewards/margins": -0.5343365669250488, "rewards/rejected": 2.961815357208252, "step": 1425 }, { "epoch": 0.32, "learning_rate": 9.57826013414832e-06, "logits/chosen": -1.4656482934951782, "logits/rejected": -1.4656482934951782, "logps/chosen": -53.61259460449219, "logps/rejected": -53.61259460449219, "loss": 1.1979, "rewards/accuracies": 0.0, "rewards/chosen": 3.289557695388794, "rewards/margins": 0.0, "rewards/rejected": 3.289557695388794, "step": 1426 }, { "epoch": 0.32, "learning_rate": 9.577539374588486e-06, "logits/chosen": -1.310706377029419, "logits/rejected": -1.2409242391586304, "logps/chosen": -150.59341430664062, "logps/rejected": -73.47773742675781, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": 5.008822917938232, "rewards/margins": 1.9055163860321045, "rewards/rejected": 3.103306531906128, "step": 1427 }, { "epoch": 0.32, "learning_rate": 9.576818026825846e-06, "logits/chosen": -1.4082835912704468, "logits/rejected": -1.1730058193206787, "logps/chosen": -48.50908660888672, "logps/rejected": -54.280738830566406, "loss": 0.6863, "rewards/accuracies": 0.0, "rewards/chosen": 3.16960072517395, "rewards/margins": -0.9901688098907471, "rewards/rejected": 4.159769535064697, "step": 1428 }, { "epoch": 0.32, "learning_rate": 9.57609609095309e-06, "logits/chosen": -1.3732960224151611, "logits/rejected": -1.3732960224151611, "logps/chosen": -82.78202056884766, "logps/rejected": -82.78202056884766, "loss": 0.3655, "rewards/accuracies": 0.0, "rewards/chosen": 3.06559681892395, "rewards/margins": 0.0, "rewards/rejected": 3.06559681892395, "step": 1429 }, { "epoch": 0.32, "learning_rate": 9.57537356706299e-06, "logits/chosen": -1.4673478603363037, "logits/rejected": -1.2904953956604004, "logps/chosen": -49.923465728759766, "logps/rejected": -70.9588851928711, "loss": 0.9654, "rewards/accuracies": 0.0, "rewards/chosen": 3.5725224018096924, "rewards/margins": -1.198777437210083, "rewards/rejected": 4.771299839019775, "step": 1430 }, { "epoch": 0.32, "learning_rate": 9.574650455248384e-06, "logits/chosen": -1.6052467823028564, "logits/rejected": -1.5904897451400757, "logps/chosen": -47.81678771972656, "logps/rejected": -55.70180892944336, "loss": 1.3783, "rewards/accuracies": 0.0, "rewards/chosen": 2.590918779373169, "rewards/margins": -1.537996530532837, "rewards/rejected": 4.128915309906006, "step": 1431 }, { "epoch": 0.32, "learning_rate": 9.573926755602194e-06, "logits/chosen": -1.4883487224578857, "logits/rejected": -1.4631223678588867, "logps/chosen": -109.66609191894531, "logps/rejected": -87.93318176269531, "loss": 0.4588, "rewards/accuracies": 0.0, "rewards/chosen": 2.813586473464966, "rewards/margins": -0.40276479721069336, "rewards/rejected": 3.216351270675659, "step": 1432 }, { "epoch": 0.32, "learning_rate": 9.573202468217408e-06, "logits/chosen": -1.1108622550964355, "logits/rejected": -1.0625871419906616, "logps/chosen": -37.62940216064453, "logps/rejected": -53.17918395996094, "loss": 0.3056, "rewards/accuracies": 1.0, "rewards/chosen": 2.9039649963378906, "rewards/margins": 1.3700157403945923, "rewards/rejected": 1.5339492559432983, "step": 1433 }, { "epoch": 0.32, "learning_rate": 9.572477593187101e-06, "logits/chosen": -1.5296801328659058, "logits/rejected": -1.5310426950454712, "logps/chosen": -53.08293914794922, "logps/rejected": -75.01304626464844, "loss": 1.0726, "rewards/accuracies": 0.0, "rewards/chosen": 2.869431257247925, "rewards/margins": -1.185389757156372, "rewards/rejected": 4.054821014404297, "step": 1434 }, { "epoch": 0.32, "learning_rate": 9.571752130604414e-06, "logits/chosen": -1.2369035482406616, "logits/rejected": -1.2369035482406616, "logps/chosen": -43.20844650268555, "logps/rejected": -43.20844650268555, "loss": 0.3752, "rewards/accuracies": 0.0, "rewards/chosen": 1.1016132831573486, "rewards/margins": 0.0, "rewards/rejected": 1.1016132831573486, "step": 1435 }, { "epoch": 0.32, "learning_rate": 9.571026080562569e-06, "logits/chosen": -1.553796648979187, "logits/rejected": -1.4744856357574463, "logps/chosen": -93.7735595703125, "logps/rejected": -49.72821807861328, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 4.943670749664307, "rewards/margins": 3.6598262786865234, "rewards/rejected": 1.2838443517684937, "step": 1436 }, { "epoch": 0.32, "learning_rate": 9.57029944315486e-06, "logits/chosen": -1.52086341381073, "logits/rejected": -1.5252021551132202, "logps/chosen": -66.38187408447266, "logps/rejected": -69.28935241699219, "loss": 0.8624, "rewards/accuracies": 0.0, "rewards/chosen": 3.1490273475646973, "rewards/margins": -0.4237983226776123, "rewards/rejected": 3.5728256702423096, "step": 1437 }, { "epoch": 0.32, "learning_rate": 9.569572218474662e-06, "logits/chosen": -1.7253072261810303, "logits/rejected": -1.5768028497695923, "logps/chosen": -112.86656188964844, "logps/rejected": -64.2898941040039, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": 8.653254508972168, "rewards/margins": 4.7004899978637695, "rewards/rejected": 3.9527642726898193, "step": 1438 }, { "epoch": 0.32, "learning_rate": 9.568844406615416e-06, "logits/chosen": -1.0867780447006226, "logits/rejected": -1.3075904846191406, "logps/chosen": -134.07293701171875, "logps/rejected": -116.2333984375, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 7.246203899383545, "rewards/margins": 0.6670961380004883, "rewards/rejected": 6.579107761383057, "step": 1439 }, { "epoch": 0.32, "learning_rate": 9.568116007670647e-06, "logits/chosen": -1.6832455396652222, "logits/rejected": -1.6832455396652222, "logps/chosen": -59.290863037109375, "logps/rejected": -59.290863037109375, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 3.9528000354766846, "rewards/margins": 0.0, "rewards/rejected": 3.9528000354766846, "step": 1440 }, { "epoch": 0.32, "learning_rate": 9.567387021733954e-06, "logits/chosen": -1.223296046257019, "logits/rejected": -1.223296046257019, "logps/chosen": -42.467864990234375, "logps/rejected": -42.467864990234375, "loss": 0.3842, "rewards/accuracies": 0.0, "rewards/chosen": 1.3779277801513672, "rewards/margins": 0.0, "rewards/rejected": 1.3779277801513672, "step": 1441 }, { "epoch": 0.32, "learning_rate": 9.566657448899009e-06, "logits/chosen": -1.3236361742019653, "logits/rejected": -1.3236361742019653, "logps/chosen": -39.73277282714844, "logps/rejected": -39.73277282714844, "loss": 0.5021, "rewards/accuracies": 0.0, "rewards/chosen": 0.7000160217285156, "rewards/margins": 0.0, "rewards/rejected": 0.7000160217285156, "step": 1442 }, { "epoch": 0.32, "learning_rate": 9.565927289259558e-06, "logits/chosen": -1.4397573471069336, "logits/rejected": -1.3877226114273071, "logps/chosen": -119.42974853515625, "logps/rejected": -72.18439483642578, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": 6.070324897766113, "rewards/margins": 3.1369752883911133, "rewards/rejected": 2.933349609375, "step": 1443 }, { "epoch": 0.32, "learning_rate": 9.565196542909425e-06, "logits/chosen": -1.2998552322387695, "logits/rejected": -1.2148288488388062, "logps/chosen": -108.18898010253906, "logps/rejected": -62.098907470703125, "loss": 0.3036, "rewards/accuracies": 1.0, "rewards/chosen": 4.082899570465088, "rewards/margins": 0.39031457901000977, "rewards/rejected": 3.692584991455078, "step": 1444 }, { "epoch": 0.32, "learning_rate": 9.564465209942512e-06, "logits/chosen": -1.6006574630737305, "logits/rejected": -1.676522970199585, "logps/chosen": -97.37619018554688, "logps/rejected": -80.91477966308594, "loss": 2.9599, "rewards/accuracies": 0.0, "rewards/chosen": 4.217334270477295, "rewards/margins": -1.5537610054016113, "rewards/rejected": 5.771095275878906, "step": 1445 }, { "epoch": 0.32, "learning_rate": 9.563733290452795e-06, "logits/chosen": -1.6520740985870361, "logits/rejected": -1.5161868333816528, "logps/chosen": -56.44635009765625, "logps/rejected": -16.67255973815918, "loss": 0.3055, "rewards/accuracies": 1.0, "rewards/chosen": 3.001690626144409, "rewards/margins": 1.986657977104187, "rewards/rejected": 1.0150326490402222, "step": 1446 }, { "epoch": 0.32, "learning_rate": 9.56300078453432e-06, "logits/chosen": -1.1671150922775269, "logits/rejected": -1.1671150922775269, "logps/chosen": -46.57276916503906, "logps/rejected": -46.57276916503906, "loss": 0.3614, "rewards/accuracies": 0.0, "rewards/chosen": 2.715857744216919, "rewards/margins": 0.0, "rewards/rejected": 2.715857744216919, "step": 1447 }, { "epoch": 0.32, "learning_rate": 9.562267692281212e-06, "logits/chosen": -1.2482393980026245, "logits/rejected": -1.2042549848556519, "logps/chosen": -34.767333984375, "logps/rejected": -52.46885681152344, "loss": 2.5486, "rewards/accuracies": 0.0, "rewards/chosen": 2.2377777099609375, "rewards/margins": -2.0474276542663574, "rewards/rejected": 4.285205364227295, "step": 1448 }, { "epoch": 0.32, "learning_rate": 9.561534013787671e-06, "logits/chosen": -1.632460117340088, "logits/rejected": -1.5008772611618042, "logps/chosen": -96.32679748535156, "logps/rejected": -24.6135196685791, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 5.543246746063232, "rewards/margins": 5.335718154907227, "rewards/rejected": 0.20752869546413422, "step": 1449 }, { "epoch": 0.32, "learning_rate": 9.560799749147977e-06, "logits/chosen": -1.5217933654785156, "logits/rejected": -1.4819782972335815, "logps/chosen": -52.140235900878906, "logps/rejected": -56.79997253417969, "loss": 0.1962, "rewards/accuracies": 1.0, "rewards/chosen": 3.245497226715088, "rewards/margins": 1.288253903388977, "rewards/rejected": 1.9572433233261108, "step": 1450 }, { "epoch": 0.32, "learning_rate": 9.56006489845648e-06, "logits/chosen": -1.3889800310134888, "logits/rejected": -1.3531434535980225, "logps/chosen": -58.37922286987305, "logps/rejected": -76.42166900634766, "loss": 1.4932, "rewards/accuracies": 0.0, "rewards/chosen": 0.7888935208320618, "rewards/margins": -2.9319615364074707, "rewards/rejected": 3.7208549976348877, "step": 1451 }, { "epoch": 0.32, "learning_rate": 9.559329461807605e-06, "logits/chosen": -1.6360441446304321, "logits/rejected": -1.5283902883529663, "logps/chosen": -59.786861419677734, "logps/rejected": -19.567272186279297, "loss": 0.3808, "rewards/accuracies": 1.0, "rewards/chosen": 1.7976016998291016, "rewards/margins": 0.6127500534057617, "rewards/rejected": 1.1848516464233398, "step": 1452 }, { "epoch": 0.32, "learning_rate": 9.558593439295853e-06, "logits/chosen": -1.3168456554412842, "logits/rejected": -1.3506674766540527, "logps/chosen": -68.61495971679688, "logps/rejected": -60.935997009277344, "loss": 1.6448, "rewards/accuracies": 0.0, "rewards/chosen": 2.01291823387146, "rewards/margins": -2.995803117752075, "rewards/rejected": 5.008721351623535, "step": 1453 }, { "epoch": 0.32, "learning_rate": 9.557856831015805e-06, "logits/chosen": -1.6011136770248413, "logits/rejected": -1.6664707660675049, "logps/chosen": -66.79476165771484, "logps/rejected": -88.23040771484375, "loss": 2.0536, "rewards/accuracies": 0.0, "rewards/chosen": 2.311976671218872, "rewards/margins": -4.0892133712768555, "rewards/rejected": 6.401190280914307, "step": 1454 }, { "epoch": 0.32, "learning_rate": 9.55711963706211e-06, "logits/chosen": -1.4978731870651245, "logits/rejected": -1.5043865442276, "logps/chosen": -58.633907318115234, "logps/rejected": -56.674041748046875, "loss": 0.9541, "rewards/accuracies": 1.0, "rewards/chosen": 2.953826665878296, "rewards/margins": 0.3905186653137207, "rewards/rejected": 2.563308000564575, "step": 1455 }, { "epoch": 0.32, "learning_rate": 9.556381857529497e-06, "logits/chosen": -1.103557825088501, "logits/rejected": -1.0811861753463745, "logps/chosen": -51.28894805908203, "logps/rejected": -64.22008514404297, "loss": 0.5171, "rewards/accuracies": 0.0, "rewards/chosen": 2.6632080078125, "rewards/margins": -0.5527627468109131, "rewards/rejected": 3.215970754623413, "step": 1456 }, { "epoch": 0.32, "learning_rate": 9.555643492512767e-06, "logits/chosen": -1.2302794456481934, "logits/rejected": -1.2090564966201782, "logps/chosen": -79.7175521850586, "logps/rejected": -42.15299987792969, "loss": 0.7304, "rewards/accuracies": 0.0, "rewards/chosen": 1.348504662513733, "rewards/margins": -0.9984489679336548, "rewards/rejected": 2.3469536304473877, "step": 1457 }, { "epoch": 0.32, "learning_rate": 9.554904542106802e-06, "logits/chosen": -1.046975016593933, "logits/rejected": -1.046975016593933, "logps/chosen": -21.326656341552734, "logps/rejected": -21.326656341552734, "loss": 0.4177, "rewards/accuracies": 0.0, "rewards/chosen": 2.3796067237854004, "rewards/margins": 0.0, "rewards/rejected": 2.3796067237854004, "step": 1458 }, { "epoch": 0.32, "learning_rate": 9.55416500640655e-06, "logits/chosen": -1.3593395948410034, "logits/rejected": -1.2038229703903198, "logps/chosen": -135.29754638671875, "logps/rejected": -39.907997131347656, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 7.929360866546631, "rewards/margins": 5.58783483505249, "rewards/rejected": 2.3415260314941406, "step": 1459 }, { "epoch": 0.32, "learning_rate": 9.553424885507045e-06, "logits/chosen": -1.2958046197891235, "logits/rejected": -1.2581838369369507, "logps/chosen": -35.54186248779297, "logps/rejected": -30.28089141845703, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 2.3027169704437256, "rewards/margins": 0.4692772626876831, "rewards/rejected": 1.8334397077560425, "step": 1460 }, { "epoch": 0.32, "learning_rate": 9.552684179503389e-06, "logits/chosen": -1.672569751739502, "logits/rejected": -1.7070029973983765, "logps/chosen": -154.9696807861328, "logps/rejected": -131.5325927734375, "loss": 2.3228, "rewards/accuracies": 0.0, "rewards/chosen": 6.652614116668701, "rewards/margins": -1.941418170928955, "rewards/rejected": 8.594032287597656, "step": 1461 }, { "epoch": 0.32, "learning_rate": 9.551942888490759e-06, "logits/chosen": -1.261101484298706, "logits/rejected": -1.1074531078338623, "logps/chosen": -113.80313873291016, "logps/rejected": -56.40290451049805, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 4.190139293670654, "rewards/margins": 2.3328018188476562, "rewards/rejected": 1.8573375940322876, "step": 1462 }, { "epoch": 0.32, "learning_rate": 9.55120101256441e-06, "logits/chosen": -1.1320096254348755, "logits/rejected": -1.0553547143936157, "logps/chosen": -23.148406982421875, "logps/rejected": -1.878082513809204, "loss": 0.4684, "rewards/accuracies": 1.0, "rewards/chosen": 1.1787079572677612, "rewards/margins": 0.45380598306655884, "rewards/rejected": 0.7249019742012024, "step": 1463 }, { "epoch": 0.32, "learning_rate": 9.550458551819672e-06, "logits/chosen": -1.6202884912490845, "logits/rejected": -1.623354196548462, "logps/chosen": -139.20474243164062, "logps/rejected": -89.38097381591797, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 6.854485988616943, "rewards/margins": 4.314444541931152, "rewards/rejected": 2.540041446685791, "step": 1464 }, { "epoch": 0.32, "learning_rate": 9.54971550635195e-06, "logits/chosen": -1.1439099311828613, "logits/rejected": -1.1345460414886475, "logps/chosen": -5.894054412841797, "logps/rejected": -3.3443334102630615, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": 1.7743953466415405, "rewards/margins": 0.41391193866729736, "rewards/rejected": 1.3604834079742432, "step": 1465 }, { "epoch": 0.32, "learning_rate": 9.548971876256721e-06, "logits/chosen": -1.0685691833496094, "logits/rejected": -0.9696155190467834, "logps/chosen": -30.43351173400879, "logps/rejected": -30.830158233642578, "loss": 1.8098, "rewards/accuracies": 1.0, "rewards/chosen": 1.5064924955368042, "rewards/margins": 0.7143023014068604, "rewards/rejected": 0.7921901941299438, "step": 1466 }, { "epoch": 0.32, "learning_rate": 9.548227661629541e-06, "logits/chosen": -1.4797849655151367, "logits/rejected": -1.390850305557251, "logps/chosen": -101.18289184570312, "logps/rejected": -22.541820526123047, "loss": 1.1071, "rewards/accuracies": 1.0, "rewards/chosen": 7.185397624969482, "rewards/margins": 6.060017108917236, "rewards/rejected": 1.1253803968429565, "step": 1467 }, { "epoch": 0.32, "learning_rate": 9.547482862566043e-06, "logits/chosen": -1.3273323774337769, "logits/rejected": -1.3055051565170288, "logps/chosen": -61.465728759765625, "logps/rejected": -110.83218383789062, "loss": 2.0982, "rewards/accuracies": 0.0, "rewards/chosen": 2.6601433753967285, "rewards/margins": -3.95694637298584, "rewards/rejected": 6.617089748382568, "step": 1468 }, { "epoch": 0.33, "learning_rate": 9.546737479161926e-06, "logits/chosen": -1.5664809942245483, "logits/rejected": -1.4024426937103271, "logps/chosen": -147.1484832763672, "logps/rejected": -112.24032592773438, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 4.329102993011475, "rewards/margins": 0.018467426300048828, "rewards/rejected": 4.310635566711426, "step": 1469 }, { "epoch": 0.33, "learning_rate": 9.545991511512975e-06, "logits/chosen": -1.188040852546692, "logits/rejected": -1.1494094133377075, "logps/chosen": -40.65281677246094, "logps/rejected": -16.420316696166992, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": 3.0282561779022217, "rewards/margins": 2.2294018268585205, "rewards/rejected": 0.7988542914390564, "step": 1470 }, { "epoch": 0.33, "learning_rate": 9.545244959715041e-06, "logits/chosen": -1.304379940032959, "logits/rejected": -1.4123036861419678, "logps/chosen": -68.08490753173828, "logps/rejected": -103.5234603881836, "loss": 2.4144, "rewards/accuracies": 0.0, "rewards/chosen": 3.118497610092163, "rewards/margins": -3.967067003250122, "rewards/rejected": 7.085564613342285, "step": 1471 }, { "epoch": 0.33, "learning_rate": 9.544497823864058e-06, "logits/chosen": -1.339505672454834, "logits/rejected": -1.271618127822876, "logps/chosen": -57.12194061279297, "logps/rejected": -50.85704803466797, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 3.266468048095703, "rewards/margins": 1.4476585388183594, "rewards/rejected": 1.8188095092773438, "step": 1472 }, { "epoch": 0.33, "learning_rate": 9.543750104056029e-06, "logits/chosen": -1.2083028554916382, "logits/rejected": -1.0825951099395752, "logps/chosen": -72.92081451416016, "logps/rejected": -47.95621109008789, "loss": 0.5483, "rewards/accuracies": 0.0, "rewards/chosen": 4.53444766998291, "rewards/margins": -0.6565899848937988, "rewards/rejected": 5.191037654876709, "step": 1473 }, { "epoch": 0.33, "learning_rate": 9.543001800387034e-06, "logits/chosen": -1.5812113285064697, "logits/rejected": -1.562277913093567, "logps/chosen": -96.1892318725586, "logps/rejected": -154.81886291503906, "loss": 0.1433, "rewards/accuracies": 1.0, "rewards/chosen": 5.991797924041748, "rewards/margins": 1.1542534828186035, "rewards/rejected": 4.8375444412231445, "step": 1474 }, { "epoch": 0.33, "learning_rate": 9.54225291295323e-06, "logits/chosen": -1.3170963525772095, "logits/rejected": -1.2428123950958252, "logps/chosen": -158.82089233398438, "logps/rejected": -41.73550796508789, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 7.258775234222412, "rewards/margins": 5.282708168029785, "rewards/rejected": 1.9760669469833374, "step": 1475 }, { "epoch": 0.33, "learning_rate": 9.541503441850844e-06, "logits/chosen": -1.3360356092453003, "logits/rejected": -1.2893140316009521, "logps/chosen": -101.85861206054688, "logps/rejected": -63.57225036621094, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": 5.694500923156738, "rewards/margins": 2.4531755447387695, "rewards/rejected": 3.2413253784179688, "step": 1476 }, { "epoch": 0.33, "learning_rate": 9.540753387176183e-06, "logits/chosen": -1.8109939098358154, "logits/rejected": -1.8173671960830688, "logps/chosen": -46.917083740234375, "logps/rejected": -75.13017272949219, "loss": 0.8909, "rewards/accuracies": 0.0, "rewards/chosen": 2.6694915294647217, "rewards/margins": -1.1665687561035156, "rewards/rejected": 3.8360602855682373, "step": 1477 }, { "epoch": 0.33, "learning_rate": 9.54000274902563e-06, "logits/chosen": -1.5923335552215576, "logits/rejected": -1.5923335552215576, "logps/chosen": -27.084299087524414, "logps/rejected": -27.084299087524414, "loss": 0.9768, "rewards/accuracies": 0.0, "rewards/chosen": 3.084955930709839, "rewards/margins": 0.0, "rewards/rejected": 3.084955930709839, "step": 1478 }, { "epoch": 0.33, "learning_rate": 9.539251527495636e-06, "logits/chosen": -1.2035785913467407, "logits/rejected": -1.1854788064956665, "logps/chosen": -61.48407745361328, "logps/rejected": -45.229488372802734, "loss": 2.1127, "rewards/accuracies": 0.0, "rewards/chosen": 2.5007057189941406, "rewards/margins": -0.13654446601867676, "rewards/rejected": 2.6372501850128174, "step": 1479 }, { "epoch": 0.33, "learning_rate": 9.538499722682733e-06, "logits/chosen": -1.0354479551315308, "logits/rejected": -1.0535222291946411, "logps/chosen": -57.59351348876953, "logps/rejected": -61.792625427246094, "loss": 0.428, "rewards/accuracies": 0.0, "rewards/chosen": 3.62764048576355, "rewards/margins": -0.1523888111114502, "rewards/rejected": 3.780029296875, "step": 1480 }, { "epoch": 0.33, "learning_rate": 9.537747334683524e-06, "logits/chosen": -1.0477159023284912, "logits/rejected": -1.0477159023284912, "logps/chosen": -12.53435230255127, "logps/rejected": -12.53435230255127, "loss": 0.7806, "rewards/accuracies": 0.0, "rewards/chosen": 1.1219420433044434, "rewards/margins": 0.0, "rewards/rejected": 1.1219420433044434, "step": 1481 }, { "epoch": 0.33, "learning_rate": 9.536994363594694e-06, "logits/chosen": -1.7195217609405518, "logits/rejected": -1.7135571241378784, "logps/chosen": -103.5139389038086, "logps/rejected": -57.88927459716797, "loss": 0.717, "rewards/accuracies": 0.0, "rewards/chosen": 2.0670037269592285, "rewards/margins": -0.32202911376953125, "rewards/rejected": 2.3890328407287598, "step": 1482 }, { "epoch": 0.33, "learning_rate": 9.536240809512994e-06, "logits/chosen": -1.820585012435913, "logits/rejected": -1.8113954067230225, "logps/chosen": -74.4581298828125, "logps/rejected": -76.53739166259766, "loss": 1.0408, "rewards/accuracies": 0.0, "rewards/chosen": 2.486982822418213, "rewards/margins": -0.7238898277282715, "rewards/rejected": 3.2108726501464844, "step": 1483 }, { "epoch": 0.33, "learning_rate": 9.535486672535255e-06, "logits/chosen": -1.410540223121643, "logits/rejected": -1.2886261940002441, "logps/chosen": -70.25250244140625, "logps/rejected": -18.958423614501953, "loss": 0.3923, "rewards/accuracies": 1.0, "rewards/chosen": 4.932838439941406, "rewards/margins": 3.6578369140625, "rewards/rejected": 1.2750015258789062, "step": 1484 }, { "epoch": 0.33, "learning_rate": 9.53473195275838e-06, "logits/chosen": -1.2579805850982666, "logits/rejected": -1.0489468574523926, "logps/chosen": -105.37834167480469, "logps/rejected": -101.23611450195312, "loss": 0.2285, "rewards/accuracies": 1.0, "rewards/chosen": 5.143750190734863, "rewards/margins": 0.5535979270935059, "rewards/rejected": 4.590152263641357, "step": 1485 }, { "epoch": 0.33, "learning_rate": 9.53397665027935e-06, "logits/chosen": -1.2947239875793457, "logits/rejected": -1.3956751823425293, "logps/chosen": -94.11624908447266, "logps/rejected": -201.55389404296875, "loss": 0.4066, "rewards/accuracies": 0.0, "rewards/chosen": 6.487929821014404, "rewards/margins": -0.21650123596191406, "rewards/rejected": 6.704431056976318, "step": 1486 }, { "epoch": 0.33, "learning_rate": 9.533220765195223e-06, "logits/chosen": -1.4421122074127197, "logits/rejected": -1.485102653503418, "logps/chosen": -111.47757720947266, "logps/rejected": -65.21955871582031, "loss": 0.5021, "rewards/accuracies": 1.0, "rewards/chosen": 6.537330150604248, "rewards/margins": 2.239212989807129, "rewards/rejected": 4.298117160797119, "step": 1487 }, { "epoch": 0.33, "learning_rate": 9.532464297603124e-06, "logits/chosen": -1.6965101957321167, "logits/rejected": -1.5128875970840454, "logps/chosen": -120.71769714355469, "logps/rejected": -31.648239135742188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 6.360487461090088, "rewards/margins": 5.427989959716797, "rewards/rejected": 0.9324974417686462, "step": 1488 }, { "epoch": 0.33, "learning_rate": 9.531707247600258e-06, "logits/chosen": -1.354142189025879, "logits/rejected": -1.2349622249603271, "logps/chosen": -125.07452392578125, "logps/rejected": -67.84941101074219, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 6.901172161102295, "rewards/margins": 3.4074130058288574, "rewards/rejected": 3.4937591552734375, "step": 1489 }, { "epoch": 0.33, "learning_rate": 9.530949615283902e-06, "logits/chosen": -1.0530365705490112, "logits/rejected": -0.9312099814414978, "logps/chosen": -89.86146545410156, "logps/rejected": -42.968994140625, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 4.416778564453125, "rewards/margins": 1.0746490955352783, "rewards/rejected": 3.3421294689178467, "step": 1490 }, { "epoch": 0.33, "learning_rate": 9.530191400751416e-06, "logits/chosen": -1.5166445970535278, "logits/rejected": -1.444834589958191, "logps/chosen": -47.1529655456543, "logps/rejected": -190.30780029296875, "loss": 2.6281, "rewards/accuracies": 0.0, "rewards/chosen": 3.557546615600586, "rewards/margins": -5.204660415649414, "rewards/rejected": 8.76220703125, "step": 1491 }, { "epoch": 0.33, "learning_rate": 9.529432604100223e-06, "logits/chosen": -1.3986849784851074, "logits/rejected": -1.245862603187561, "logps/chosen": -87.62432098388672, "logps/rejected": -97.33548736572266, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": 3.6237564086914062, "rewards/margins": 0.7250845432281494, "rewards/rejected": 2.898671865463257, "step": 1492 }, { "epoch": 0.33, "learning_rate": 9.528673225427831e-06, "logits/chosen": -1.078519582748413, "logits/rejected": -1.1223827600479126, "logps/chosen": -30.50979995727539, "logps/rejected": -65.37415313720703, "loss": 0.5214, "rewards/accuracies": 1.0, "rewards/chosen": 2.7939319610595703, "rewards/margins": 0.8709697723388672, "rewards/rejected": 1.9229621887207031, "step": 1493 }, { "epoch": 0.33, "learning_rate": 9.527913264831817e-06, "logits/chosen": -1.3196669816970825, "logits/rejected": -1.2284644842147827, "logps/chosen": -53.22970962524414, "logps/rejected": -40.25043487548828, "loss": 0.8593, "rewards/accuracies": 0.0, "rewards/chosen": 2.3036274909973145, "rewards/margins": -1.5080738067626953, "rewards/rejected": 3.8117012977600098, "step": 1494 }, { "epoch": 0.33, "learning_rate": 9.52715272240983e-06, "logits/chosen": -1.4677562713623047, "logits/rejected": -1.4969117641448975, "logps/chosen": -79.86937713623047, "logps/rejected": -126.90979766845703, "loss": 3.7485, "rewards/accuracies": 0.0, "rewards/chosen": 2.2639663219451904, "rewards/margins": -5.442935943603516, "rewards/rejected": 7.706902503967285, "step": 1495 }, { "epoch": 0.33, "learning_rate": 9.526391598259604e-06, "logits/chosen": -1.3924121856689453, "logits/rejected": -1.280860424041748, "logps/chosen": -47.972320556640625, "logps/rejected": -46.6241340637207, "loss": 0.3833, "rewards/accuracies": 1.0, "rewards/chosen": 3.797560930252075, "rewards/margins": 2.591010332107544, "rewards/rejected": 1.2065505981445312, "step": 1496 }, { "epoch": 0.33, "learning_rate": 9.525629892478936e-06, "logits/chosen": -1.365982174873352, "logits/rejected": -1.219017505645752, "logps/chosen": -63.25064468383789, "logps/rejected": -41.982967376708984, "loss": 0.5748, "rewards/accuracies": 0.0, "rewards/chosen": 2.80204439163208, "rewards/margins": -0.3814711570739746, "rewards/rejected": 3.1835155487060547, "step": 1497 }, { "epoch": 0.33, "learning_rate": 9.524867605165709e-06, "logits/chosen": -1.6372499465942383, "logits/rejected": -1.6908725500106812, "logps/chosen": -86.1075210571289, "logps/rejected": -67.81646728515625, "loss": 0.6572, "rewards/accuracies": 0.0, "rewards/chosen": 2.108435869216919, "rewards/margins": -0.8341879844665527, "rewards/rejected": 2.9426238536834717, "step": 1498 }, { "epoch": 0.33, "learning_rate": 9.52410473641787e-06, "logits/chosen": -1.5068445205688477, "logits/rejected": -1.4876139163970947, "logps/chosen": -49.058143615722656, "logps/rejected": -39.722801208496094, "loss": 1.0532, "rewards/accuracies": 0.0, "rewards/chosen": 0.7548496127128601, "rewards/margins": -1.0838348865509033, "rewards/rejected": 1.8386844396591187, "step": 1499 }, { "epoch": 0.33, "learning_rate": 9.523341286333448e-06, "logits/chosen": -1.6235445737838745, "logits/rejected": -1.6043673753738403, "logps/chosen": -23.62890625, "logps/rejected": -54.95829772949219, "loss": 1.6874, "rewards/accuracies": 0.0, "rewards/chosen": 0.4936216473579407, "rewards/margins": -2.52083158493042, "rewards/rejected": 3.014453172683716, "step": 1500 }, { "epoch": 0.33, "learning_rate": 9.522577255010546e-06, "logits/chosen": -0.9626737833023071, "logits/rejected": -0.9626737833023071, "logps/chosen": -40.98566818237305, "logps/rejected": -40.98566818237305, "loss": 0.7366, "rewards/accuracies": 0.0, "rewards/chosen": 2.366628646850586, "rewards/margins": 0.0, "rewards/rejected": 2.366628646850586, "step": 1501 }, { "epoch": 0.33, "learning_rate": 9.521812642547337e-06, "logits/chosen": -1.6566925048828125, "logits/rejected": -1.6504572629928589, "logps/chosen": -35.053226470947266, "logps/rejected": -60.0777587890625, "loss": 1.2634, "rewards/accuracies": 1.0, "rewards/chosen": 2.7277679443359375, "rewards/margins": 1.6707550287246704, "rewards/rejected": 1.057012915611267, "step": 1502 }, { "epoch": 0.33, "learning_rate": 9.521047449042075e-06, "logits/chosen": -1.1831274032592773, "logits/rejected": -1.1949340105056763, "logps/chosen": -4.844565391540527, "logps/rejected": -19.361637115478516, "loss": 0.629, "rewards/accuracies": 0.0, "rewards/chosen": 0.6275286674499512, "rewards/margins": -0.750731348991394, "rewards/rejected": 1.3782600164413452, "step": 1503 }, { "epoch": 0.33, "learning_rate": 9.520281674593084e-06, "logits/chosen": -1.2872560024261475, "logits/rejected": -1.2476377487182617, "logps/chosen": -63.10469436645508, "logps/rejected": -53.15892028808594, "loss": 0.1279, "rewards/accuracies": 1.0, "rewards/chosen": 2.7540714740753174, "rewards/margins": 1.3197444677352905, "rewards/rejected": 1.4343270063400269, "step": 1504 }, { "epoch": 0.33, "learning_rate": 9.519515319298765e-06, "logits/chosen": -1.995718002319336, "logits/rejected": -1.9181607961654663, "logps/chosen": -97.27656555175781, "logps/rejected": -89.90774536132812, "loss": 0.112, "rewards/accuracies": 1.0, "rewards/chosen": 6.259616374969482, "rewards/margins": 3.5690996646881104, "rewards/rejected": 2.690516710281372, "step": 1505 }, { "epoch": 0.33, "learning_rate": 9.51874838325759e-06, "logits/chosen": -1.6762179136276245, "logits/rejected": -1.502307415008545, "logps/chosen": -89.39710998535156, "logps/rejected": -77.72945404052734, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": 6.410408020019531, "rewards/margins": 4.052953243255615, "rewards/rejected": 2.357454776763916, "step": 1506 }, { "epoch": 0.33, "learning_rate": 9.517980866568112e-06, "logits/chosen": -1.2787200212478638, "logits/rejected": -1.2794933319091797, "logps/chosen": -53.18746566772461, "logps/rejected": -43.442169189453125, "loss": 1.9358, "rewards/accuracies": 0.0, "rewards/chosen": 1.1124622821807861, "rewards/margins": -3.1635234355926514, "rewards/rejected": 4.2759857177734375, "step": 1507 }, { "epoch": 0.33, "learning_rate": 9.517212769328952e-06, "logits/chosen": -1.2205919027328491, "logits/rejected": -1.2205919027328491, "logps/chosen": -18.03293228149414, "logps/rejected": -18.03293228149414, "loss": 0.4241, "rewards/accuracies": 0.0, "rewards/chosen": 1.2339051961898804, "rewards/margins": 0.0, "rewards/rejected": 1.2339051961898804, "step": 1508 }, { "epoch": 0.33, "learning_rate": 9.516444091638812e-06, "logits/chosen": -1.290097951889038, "logits/rejected": -1.290097951889038, "logps/chosen": -41.82389831542969, "logps/rejected": -41.82389831542969, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 1.2991164922714233, "rewards/margins": 0.0, "rewards/rejected": 1.2991164922714233, "step": 1509 }, { "epoch": 0.33, "learning_rate": 9.515674833596464e-06, "logits/chosen": -1.476065993309021, "logits/rejected": -1.405001163482666, "logps/chosen": -83.53153991699219, "logps/rejected": -49.398162841796875, "loss": 0.6585, "rewards/accuracies": 1.0, "rewards/chosen": 2.647076368331909, "rewards/margins": 0.24753475189208984, "rewards/rejected": 2.3995416164398193, "step": 1510 }, { "epoch": 0.33, "learning_rate": 9.514904995300754e-06, "logits/chosen": -1.5389775037765503, "logits/rejected": -1.5124329328536987, "logps/chosen": -58.25834655761719, "logps/rejected": -76.67389678955078, "loss": 1.5145, "rewards/accuracies": 0.0, "rewards/chosen": 3.460174560546875, "rewards/margins": -0.4272117614746094, "rewards/rejected": 3.8873863220214844, "step": 1511 }, { "epoch": 0.33, "learning_rate": 9.514134576850605e-06, "logits/chosen": -1.429861068725586, "logits/rejected": -1.158552885055542, "logps/chosen": -109.73172760009766, "logps/rejected": -49.18326187133789, "loss": 1.6222, "rewards/accuracies": 1.0, "rewards/chosen": 5.875854015350342, "rewards/margins": 3.191638708114624, "rewards/rejected": 2.6842153072357178, "step": 1512 }, { "epoch": 0.33, "learning_rate": 9.513363578345014e-06, "logits/chosen": -1.693472981452942, "logits/rejected": -1.6738537549972534, "logps/chosen": -51.79352951049805, "logps/rejected": -63.2950439453125, "loss": 0.5533, "rewards/accuracies": 0.0, "rewards/chosen": 3.4553487300872803, "rewards/margins": -0.6891276836395264, "rewards/rejected": 4.144476413726807, "step": 1513 }, { "epoch": 0.34, "learning_rate": 9.512591999883056e-06, "logits/chosen": -1.4427810907363892, "logits/rejected": -1.417020320892334, "logps/chosen": -65.0430679321289, "logps/rejected": -83.12094116210938, "loss": 0.2387, "rewards/accuracies": 1.0, "rewards/chosen": 2.298778533935547, "rewards/margins": 0.6158438920974731, "rewards/rejected": 1.6829346418380737, "step": 1514 }, { "epoch": 0.34, "learning_rate": 9.511819841563872e-06, "logits/chosen": -1.355104923248291, "logits/rejected": -1.2988158464431763, "logps/chosen": -107.49769592285156, "logps/rejected": -71.27436828613281, "loss": 1.2081, "rewards/accuracies": 0.0, "rewards/chosen": 3.621835470199585, "rewards/margins": -2.2763779163360596, "rewards/rejected": 5.8982133865356445, "step": 1515 }, { "epoch": 0.34, "learning_rate": 9.511047103486685e-06, "logits/chosen": -1.360938549041748, "logits/rejected": -1.317509651184082, "logps/chosen": -38.712242126464844, "logps/rejected": -40.291603088378906, "loss": 0.9022, "rewards/accuracies": 0.0, "rewards/chosen": 1.9657729864120483, "rewards/margins": -0.5032631158828735, "rewards/rejected": 2.469036102294922, "step": 1516 }, { "epoch": 0.34, "learning_rate": 9.510273785750788e-06, "logits/chosen": -1.5015403032302856, "logits/rejected": -1.5341105461120605, "logps/chosen": -54.76457977294922, "logps/rejected": -52.961570739746094, "loss": 1.5307, "rewards/accuracies": 0.0, "rewards/chosen": 2.212059736251831, "rewards/margins": -2.703169584274292, "rewards/rejected": 4.915229320526123, "step": 1517 }, { "epoch": 0.34, "learning_rate": 9.509499888455554e-06, "logits/chosen": -1.7093864679336548, "logits/rejected": -1.7097210884094238, "logps/chosen": -86.08621215820312, "logps/rejected": -48.965999603271484, "loss": 0.7958, "rewards/accuracies": 0.0, "rewards/chosen": 0.7066330313682556, "rewards/margins": -1.2965190410614014, "rewards/rejected": 2.0031521320343018, "step": 1518 }, { "epoch": 0.34, "learning_rate": 9.508725411700424e-06, "logits/chosen": -1.520771861076355, "logits/rejected": -1.515853762626648, "logps/chosen": -44.87849426269531, "logps/rejected": -86.12631225585938, "loss": 1.0714, "rewards/accuracies": 0.0, "rewards/chosen": 2.58442759513855, "rewards/margins": -1.9810540676116943, "rewards/rejected": 4.565481662750244, "step": 1519 }, { "epoch": 0.34, "learning_rate": 9.507950355584917e-06, "logits/chosen": -1.2785521745681763, "logits/rejected": -1.2172008752822876, "logps/chosen": -91.86689758300781, "logps/rejected": -82.267333984375, "loss": 0.343, "rewards/accuracies": 1.0, "rewards/chosen": 2.4885239601135254, "rewards/margins": 0.13470458984375, "rewards/rejected": 2.3538193702697754, "step": 1520 }, { "epoch": 0.34, "learning_rate": 9.507174720208627e-06, "logits/chosen": -1.4642902612686157, "logits/rejected": -1.3119404315948486, "logps/chosen": -68.39552307128906, "logps/rejected": -24.269912719726562, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 3.663832902908325, "rewards/margins": 2.877218246459961, "rewards/rejected": 0.7866145968437195, "step": 1521 }, { "epoch": 0.34, "learning_rate": 9.50639850567122e-06, "logits/chosen": -1.3662837743759155, "logits/rejected": -1.2998172044754028, "logps/chosen": -43.660518646240234, "logps/rejected": -44.735687255859375, "loss": 0.9193, "rewards/accuracies": 0.0, "rewards/chosen": 2.5866901874542236, "rewards/margins": -0.7625575065612793, "rewards/rejected": 3.349247694015503, "step": 1522 }, { "epoch": 0.34, "learning_rate": 9.505621712072437e-06, "logits/chosen": -1.3986759185791016, "logits/rejected": -1.3038873672485352, "logps/chosen": -58.742462158203125, "logps/rejected": -61.69932174682617, "loss": 0.3854, "rewards/accuracies": 1.0, "rewards/chosen": 3.119982957839966, "rewards/margins": 0.09415698051452637, "rewards/rejected": 3.0258259773254395, "step": 1523 }, { "epoch": 0.34, "learning_rate": 9.504844339512096e-06, "logits/chosen": -1.2422192096710205, "logits/rejected": -1.2422192096710205, "logps/chosen": -17.26220703125, "logps/rejected": -17.26220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.6656074523925781, "rewards/margins": 0.0, "rewards/rejected": 1.6656074523925781, "step": 1524 }, { "epoch": 0.34, "learning_rate": 9.504066388090088e-06, "logits/chosen": -1.1784998178482056, "logits/rejected": -1.1112769842147827, "logps/chosen": -28.99984359741211, "logps/rejected": -26.809682846069336, "loss": 0.1947, "rewards/accuracies": 1.0, "rewards/chosen": 2.1410882472991943, "rewards/margins": 0.7888892889022827, "rewards/rejected": 1.3521989583969116, "step": 1525 }, { "epoch": 0.34, "learning_rate": 9.503287857906374e-06, "logits/chosen": -1.318153977394104, "logits/rejected": -1.2943521738052368, "logps/chosen": -65.45306396484375, "logps/rejected": -39.48685836791992, "loss": 0.1387, "rewards/accuracies": 1.0, "rewards/chosen": 3.2033448219299316, "rewards/margins": 1.2445775270462036, "rewards/rejected": 1.958767294883728, "step": 1526 }, { "epoch": 0.34, "learning_rate": 9.502508749060998e-06, "logits/chosen": -1.4882739782333374, "logits/rejected": -1.4841740131378174, "logps/chosen": -36.52244567871094, "logps/rejected": -29.398916244506836, "loss": 0.4304, "rewards/accuracies": 0.0, "rewards/chosen": 1.3606361150741577, "rewards/margins": -0.03179121017456055, "rewards/rejected": 1.3924273252487183, "step": 1527 }, { "epoch": 0.34, "learning_rate": 9.50172906165407e-06, "logits/chosen": -1.443824291229248, "logits/rejected": -1.383542776107788, "logps/chosen": -142.08311462402344, "logps/rejected": -97.86009216308594, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": 7.724483013153076, "rewards/margins": 1.8481006622314453, "rewards/rejected": 5.876382350921631, "step": 1528 }, { "epoch": 0.34, "learning_rate": 9.50094879578578e-06, "logits/chosen": -1.3469209671020508, "logits/rejected": -1.2300807237625122, "logps/chosen": -40.255157470703125, "logps/rejected": -3.8916702270507812, "loss": 0.1755, "rewards/accuracies": 1.0, "rewards/chosen": 2.0866048336029053, "rewards/margins": 1.0980634689331055, "rewards/rejected": 0.9885414242744446, "step": 1529 }, { "epoch": 0.34, "learning_rate": 9.500167951556392e-06, "logits/chosen": -1.2247819900512695, "logits/rejected": -1.117379069328308, "logps/chosen": -84.29269409179688, "logps/rejected": -105.62813568115234, "loss": 0.1284, "rewards/accuracies": 1.0, "rewards/chosen": 4.563592433929443, "rewards/margins": 2.0140492916107178, "rewards/rejected": 2.5495431423187256, "step": 1530 }, { "epoch": 0.34, "learning_rate": 9.499386529066236e-06, "logits/chosen": -1.2902544736862183, "logits/rejected": -1.289789080619812, "logps/chosen": -42.08171081542969, "logps/rejected": -50.91511917114258, "loss": 0.9034, "rewards/accuracies": 0.0, "rewards/chosen": 2.2961792945861816, "rewards/margins": -1.251272201538086, "rewards/rejected": 3.5474514961242676, "step": 1531 }, { "epoch": 0.34, "learning_rate": 9.498604528415731e-06, "logits/chosen": -1.1660112142562866, "logits/rejected": -1.214814305305481, "logps/chosen": -28.05971336364746, "logps/rejected": -45.071903228759766, "loss": 1.8669, "rewards/accuracies": 0.0, "rewards/chosen": 1.319106936454773, "rewards/margins": -3.457958698272705, "rewards/rejected": 4.777065753936768, "step": 1532 }, { "epoch": 0.34, "learning_rate": 9.497821949705356e-06, "logits/chosen": -1.3101304769515991, "logits/rejected": -1.1416796445846558, "logps/chosen": -60.540496826171875, "logps/rejected": -23.641569137573242, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": 3.4085769653320312, "rewards/margins": 3.3780760765075684, "rewards/rejected": 0.03050079382956028, "step": 1533 }, { "epoch": 0.34, "learning_rate": 9.497038793035674e-06, "logits/chosen": -1.526971697807312, "logits/rejected": -1.3576966524124146, "logps/chosen": -57.43008041381836, "logps/rejected": -63.58869171142578, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 5.686713695526123, "rewards/margins": 3.942257881164551, "rewards/rejected": 1.7444556951522827, "step": 1534 }, { "epoch": 0.34, "learning_rate": 9.496255058507318e-06, "logits/chosen": -1.5669026374816895, "logits/rejected": -1.328691840171814, "logps/chosen": -123.18143463134766, "logps/rejected": -65.80996704101562, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 8.098394393920898, "rewards/margins": 3.8618321418762207, "rewards/rejected": 4.236562252044678, "step": 1535 }, { "epoch": 0.34, "learning_rate": 9.495470746220995e-06, "logits/chosen": -1.2777819633483887, "logits/rejected": -1.2584611177444458, "logps/chosen": -120.73275756835938, "logps/rejected": -55.77758026123047, "loss": 0.4129, "rewards/accuracies": 0.0, "rewards/chosen": 5.670942783355713, "rewards/margins": -0.24816608428955078, "rewards/rejected": 5.919108867645264, "step": 1536 }, { "epoch": 0.34, "learning_rate": 9.494685856277488e-06, "logits/chosen": -1.2993698120117188, "logits/rejected": -1.0726044178009033, "logps/chosen": -237.6156768798828, "logps/rejected": -104.2944564819336, "loss": 0.6124, "rewards/accuracies": 1.0, "rewards/chosen": 7.597905158996582, "rewards/margins": 3.4901833534240723, "rewards/rejected": 4.10772180557251, "step": 1537 }, { "epoch": 0.34, "learning_rate": 9.493900388777654e-06, "logits/chosen": -1.2349801063537598, "logits/rejected": -1.3630614280700684, "logps/chosen": -98.84323120117188, "logps/rejected": -129.84909057617188, "loss": 2.2517, "rewards/accuracies": 0.0, "rewards/chosen": 2.2632293701171875, "rewards/margins": -4.45041036605835, "rewards/rejected": 6.713639736175537, "step": 1538 }, { "epoch": 0.34, "learning_rate": 9.493114343822422e-06, "logits/chosen": -1.336669921875, "logits/rejected": -1.141626238822937, "logps/chosen": -44.204830169677734, "logps/rejected": -24.77305793762207, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": 2.2718236446380615, "rewards/margins": 0.9401583671569824, "rewards/rejected": 1.331665277481079, "step": 1539 }, { "epoch": 0.34, "learning_rate": 9.4923277215128e-06, "logits/chosen": -1.4375406503677368, "logits/rejected": -1.3813294172286987, "logps/chosen": -118.41299438476562, "logps/rejected": -67.82386779785156, "loss": 1.8232, "rewards/accuracies": 0.0, "rewards/chosen": 1.1403900384902954, "rewards/margins": -2.899599552154541, "rewards/rejected": 4.039989471435547, "step": 1540 }, { "epoch": 0.34, "learning_rate": 9.491540521949862e-06, "logits/chosen": -1.4772028923034668, "logits/rejected": -1.4820351600646973, "logps/chosen": -70.66877746582031, "logps/rejected": -93.20550537109375, "loss": 0.5339, "rewards/accuracies": 0.0, "rewards/chosen": 2.5520553588867188, "rewards/margins": -0.645575761795044, "rewards/rejected": 3.1976311206817627, "step": 1541 }, { "epoch": 0.34, "learning_rate": 9.490752745234767e-06, "logits/chosen": -1.7240045070648193, "logits/rejected": -1.7464410066604614, "logps/chosen": -125.18436431884766, "logps/rejected": -103.78691864013672, "loss": 1.1873, "rewards/accuracies": 0.0, "rewards/chosen": 5.775689125061035, "rewards/margins": -1.0732955932617188, "rewards/rejected": 6.848984718322754, "step": 1542 }, { "epoch": 0.34, "learning_rate": 9.489964391468739e-06, "logits/chosen": -1.4387471675872803, "logits/rejected": -1.3819591999053955, "logps/chosen": -119.91123962402344, "logps/rejected": -83.4872055053711, "loss": 0.4237, "rewards/accuracies": 0.0, "rewards/chosen": 5.791206359863281, "rewards/margins": -0.2675285339355469, "rewards/rejected": 6.058734893798828, "step": 1543 }, { "epoch": 0.34, "learning_rate": 9.48917546075308e-06, "logits/chosen": -1.4623730182647705, "logits/rejected": -1.2901962995529175, "logps/chosen": -86.06884765625, "logps/rejected": -37.026588439941406, "loss": 0.197, "rewards/accuracies": 1.0, "rewards/chosen": 7.114596843719482, "rewards/margins": 5.6184821128845215, "rewards/rejected": 1.496114730834961, "step": 1544 }, { "epoch": 0.34, "learning_rate": 9.488385953189165e-06, "logits/chosen": -1.4126065969467163, "logits/rejected": -1.4126065969467163, "logps/chosen": -48.592247009277344, "logps/rejected": -48.592247009277344, "loss": 0.5607, "rewards/accuracies": 0.0, "rewards/chosen": 4.11005163192749, "rewards/margins": 0.0, "rewards/rejected": 4.11005163192749, "step": 1545 }, { "epoch": 0.34, "learning_rate": 9.487595868878447e-06, "logits/chosen": -1.2993290424346924, "logits/rejected": -1.3621423244476318, "logps/chosen": -78.80985260009766, "logps/rejected": -86.54685974121094, "loss": 1.8425, "rewards/accuracies": 0.0, "rewards/chosen": 4.033489227294922, "rewards/margins": -3.6421380043029785, "rewards/rejected": 7.6756272315979, "step": 1546 }, { "epoch": 0.34, "learning_rate": 9.486805207922445e-06, "logits/chosen": -1.0529918670654297, "logits/rejected": -1.0529918670654297, "logps/chosen": -40.07855224609375, "logps/rejected": -40.07855224609375, "loss": 0.7514, "rewards/accuracies": 0.0, "rewards/chosen": 1.528490424156189, "rewards/margins": 0.0, "rewards/rejected": 1.528490424156189, "step": 1547 }, { "epoch": 0.34, "learning_rate": 9.486013970422762e-06, "logits/chosen": -1.4424686431884766, "logits/rejected": -1.4596898555755615, "logps/chosen": -49.8412971496582, "logps/rejected": -81.26974487304688, "loss": 0.6863, "rewards/accuracies": 0.0, "rewards/chosen": 2.9557881355285645, "rewards/margins": -1.042675256729126, "rewards/rejected": 3.9984633922576904, "step": 1548 }, { "epoch": 0.34, "learning_rate": 9.485222156481067e-06, "logits/chosen": -1.3437899351119995, "logits/rejected": -1.2641125917434692, "logps/chosen": -51.208091735839844, "logps/rejected": -23.647449493408203, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 2.897240400314331, "rewards/margins": 0.006880760192871094, "rewards/rejected": 2.89035964012146, "step": 1549 }, { "epoch": 0.34, "learning_rate": 9.484429766199107e-06, "logits/chosen": -1.3312368392944336, "logits/rejected": -1.2997829914093018, "logps/chosen": -182.3723907470703, "logps/rejected": -96.01075744628906, "loss": 1.5107, "rewards/accuracies": 0.0, "rewards/chosen": 6.513469219207764, "rewards/margins": -2.9246668815612793, "rewards/rejected": 9.438136100769043, "step": 1550 }, { "epoch": 0.34, "learning_rate": 9.483636799678703e-06, "logits/chosen": -1.515538215637207, "logits/rejected": -1.3481324911117554, "logps/chosen": -137.6666717529297, "logps/rejected": -49.270790100097656, "loss": 0.7829, "rewards/accuracies": 1.0, "rewards/chosen": 6.080445766448975, "rewards/margins": 4.408822536468506, "rewards/rejected": 1.6716232299804688, "step": 1551 }, { "epoch": 0.34, "learning_rate": 9.482843257021747e-06, "logits/chosen": -0.9736482501029968, "logits/rejected": -0.9692278504371643, "logps/chosen": -32.69709396362305, "logps/rejected": -42.1295166015625, "loss": 0.744, "rewards/accuracies": 0.0, "rewards/chosen": 1.7946285009384155, "rewards/margins": -0.1879863739013672, "rewards/rejected": 1.9826148748397827, "step": 1552 }, { "epoch": 0.34, "learning_rate": 9.48204913833021e-06, "logits/chosen": -1.2791976928710938, "logits/rejected": -1.1980611085891724, "logps/chosen": -107.43610382080078, "logps/rejected": -83.02041625976562, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": 5.806844234466553, "rewards/margins": 1.8249037265777588, "rewards/rejected": 3.981940507888794, "step": 1553 }, { "epoch": 0.34, "learning_rate": 9.481254443706133e-06, "logits/chosen": -1.0659881830215454, "logits/rejected": -0.9364656805992126, "logps/chosen": -84.83023071289062, "logps/rejected": -49.68318176269531, "loss": 1.5801, "rewards/accuracies": 1.0, "rewards/chosen": 4.708996772766113, "rewards/margins": 1.026094913482666, "rewards/rejected": 3.6829018592834473, "step": 1554 }, { "epoch": 0.34, "learning_rate": 9.480459173251634e-06, "logits/chosen": -1.5124865770339966, "logits/rejected": -1.5723167657852173, "logps/chosen": -59.905879974365234, "logps/rejected": -107.99276733398438, "loss": 0.8681, "rewards/accuracies": 0.0, "rewards/chosen": 2.7666897773742676, "rewards/margins": -1.452641487121582, "rewards/rejected": 4.21933126449585, "step": 1555 }, { "epoch": 0.34, "learning_rate": 9.4796633270689e-06, "logits/chosen": -1.29225754737854, "logits/rejected": -1.2028602361679077, "logps/chosen": -85.4747314453125, "logps/rejected": -118.35736083984375, "loss": 1.1966, "rewards/accuracies": 0.0, "rewards/chosen": 4.957296848297119, "rewards/margins": -2.2743988037109375, "rewards/rejected": 7.231695652008057, "step": 1556 }, { "epoch": 0.34, "learning_rate": 9.478866905260198e-06, "logits/chosen": -1.5319243669509888, "logits/rejected": -1.5076837539672852, "logps/chosen": -84.7738037109375, "logps/rejected": -83.20731353759766, "loss": 0.6045, "rewards/accuracies": 1.0, "rewards/chosen": 4.717886447906494, "rewards/margins": 0.020618438720703125, "rewards/rejected": 4.697268009185791, "step": 1557 }, { "epoch": 0.34, "learning_rate": 9.478069907927867e-06, "logits/chosen": -1.0182818174362183, "logits/rejected": -0.8507547378540039, "logps/chosen": -51.868892669677734, "logps/rejected": -47.05242919921875, "loss": 0.347, "rewards/accuracies": 1.0, "rewards/chosen": 4.066882610321045, "rewards/margins": 1.2257814407348633, "rewards/rejected": 2.8411011695861816, "step": 1558 }, { "epoch": 0.35, "learning_rate": 9.477272335174315e-06, "logits/chosen": -1.37739098072052, "logits/rejected": -1.2706996202468872, "logps/chosen": -60.648353576660156, "logps/rejected": -19.73126983642578, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 1.6918327808380127, "rewards/margins": 1.5331852436065674, "rewards/rejected": 0.1586475372314453, "step": 1559 }, { "epoch": 0.35, "learning_rate": 9.476474187102033e-06, "logits/chosen": -1.2186557054519653, "logits/rejected": -1.1886706352233887, "logps/chosen": -55.799530029296875, "logps/rejected": -62.64745330810547, "loss": 0.8531, "rewards/accuracies": 0.0, "rewards/chosen": 1.197731852531433, "rewards/margins": -0.09903335571289062, "rewards/rejected": 1.2967652082443237, "step": 1560 }, { "epoch": 0.35, "learning_rate": 9.475675463813578e-06, "logits/chosen": -1.276779055595398, "logits/rejected": -1.3719589710235596, "logps/chosen": -83.86211395263672, "logps/rejected": -102.68135070800781, "loss": 1.9747, "rewards/accuracies": 0.0, "rewards/chosen": 2.190359592437744, "rewards/margins": -3.927304267883301, "rewards/rejected": 6.117663860321045, "step": 1561 }, { "epoch": 0.35, "learning_rate": 9.474876165411586e-06, "logits/chosen": -1.4442272186279297, "logits/rejected": -1.4941318035125732, "logps/chosen": -55.404815673828125, "logps/rejected": -74.2254409790039, "loss": 1.7956, "rewards/accuracies": 0.0, "rewards/chosen": 2.90740966796875, "rewards/margins": -3.2912344932556152, "rewards/rejected": 6.198644161224365, "step": 1562 }, { "epoch": 0.35, "learning_rate": 9.474076291998765e-06, "logits/chosen": -1.1928157806396484, "logits/rejected": -1.173221230506897, "logps/chosen": -38.0751838684082, "logps/rejected": -38.6337890625, "loss": 0.5582, "rewards/accuracies": 1.0, "rewards/chosen": 2.6626102924346924, "rewards/margins": 1.2954517602920532, "rewards/rejected": 1.3671585321426392, "step": 1563 }, { "epoch": 0.35, "learning_rate": 9.473275843677893e-06, "logits/chosen": -1.584847092628479, "logits/rejected": -1.5272328853607178, "logps/chosen": -95.79804992675781, "logps/rejected": -32.57624053955078, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 6.280113220214844, "rewards/margins": 4.718242645263672, "rewards/rejected": 1.5618705749511719, "step": 1564 }, { "epoch": 0.35, "learning_rate": 9.472474820551831e-06, "logits/chosen": -1.3438374996185303, "logits/rejected": -1.3438374996185303, "logps/chosen": -11.823087692260742, "logps/rejected": -11.823087692260742, "loss": 0.6111, "rewards/accuracies": 0.0, "rewards/chosen": 1.9868046045303345, "rewards/margins": 0.0, "rewards/rejected": 1.9868046045303345, "step": 1565 }, { "epoch": 0.35, "learning_rate": 9.471673222723506e-06, "logits/chosen": -1.21247136592865, "logits/rejected": -1.0407108068466187, "logps/chosen": -57.73046112060547, "logps/rejected": -14.446147918701172, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 3.339244842529297, "rewards/margins": 2.743591070175171, "rewards/rejected": 0.5956537127494812, "step": 1566 }, { "epoch": 0.35, "learning_rate": 9.47087105029592e-06, "logits/chosen": -1.498191237449646, "logits/rejected": -1.5181639194488525, "logps/chosen": -68.18286895751953, "logps/rejected": -30.523235321044922, "loss": 0.9299, "rewards/accuracies": 0.0, "rewards/chosen": 1.759131669998169, "rewards/margins": -1.6893246173858643, "rewards/rejected": 3.448456287384033, "step": 1567 }, { "epoch": 0.35, "learning_rate": 9.470068303372153e-06, "logits/chosen": -1.5966027975082397, "logits/rejected": -1.5176632404327393, "logps/chosen": -121.66061401367188, "logps/rejected": -109.86345672607422, "loss": 1.1607, "rewards/accuracies": 0.0, "rewards/chosen": 4.743309020996094, "rewards/margins": -2.1823782920837402, "rewards/rejected": 6.925687313079834, "step": 1568 }, { "epoch": 0.35, "learning_rate": 9.469264982055355e-06, "logits/chosen": -1.332247018814087, "logits/rejected": -1.3281972408294678, "logps/chosen": -54.7479248046875, "logps/rejected": -41.20856475830078, "loss": 0.5258, "rewards/accuracies": 0.0, "rewards/chosen": 2.644627332687378, "rewards/margins": -0.5698151588439941, "rewards/rejected": 3.214442491531372, "step": 1569 }, { "epoch": 0.35, "learning_rate": 9.46846108644875e-06, "logits/chosen": -1.4934440851211548, "logits/rejected": -1.5286800861358643, "logps/chosen": -99.05690002441406, "logps/rejected": -103.33242797851562, "loss": 1.0253, "rewards/accuracies": 0.0, "rewards/chosen": 6.550698757171631, "rewards/margins": -1.8571124076843262, "rewards/rejected": 8.407811164855957, "step": 1570 }, { "epoch": 0.35, "learning_rate": 9.467656616655636e-06, "logits/chosen": -1.3002327680587769, "logits/rejected": -1.0816878080368042, "logps/chosen": -97.28289794921875, "logps/rejected": -21.66572380065918, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 5.2534074783325195, "rewards/margins": 4.809439659118652, "rewards/rejected": 0.44396764039993286, "step": 1571 }, { "epoch": 0.35, "learning_rate": 9.466851572779388e-06, "logits/chosen": -1.2849411964416504, "logits/rejected": -1.2353447675704956, "logps/chosen": -64.71686553955078, "logps/rejected": -82.21200561523438, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": 3.1225106716156006, "rewards/margins": 0.5410211086273193, "rewards/rejected": 2.5814895629882812, "step": 1572 }, { "epoch": 0.35, "learning_rate": 9.46604595492345e-06, "logits/chosen": -1.4924869537353516, "logits/rejected": -1.3985923528671265, "logps/chosen": -54.42180633544922, "logps/rejected": -48.35475158691406, "loss": 0.4553, "rewards/accuracies": 0.0, "rewards/chosen": 1.8668091297149658, "rewards/margins": -0.3786582946777344, "rewards/rejected": 2.2454674243927, "step": 1573 }, { "epoch": 0.35, "learning_rate": 9.465239763191345e-06, "logits/chosen": -1.2930665016174316, "logits/rejected": -1.2715635299682617, "logps/chosen": -34.3011474609375, "logps/rejected": -31.11857795715332, "loss": 0.2421, "rewards/accuracies": 1.0, "rewards/chosen": 2.9166817665100098, "rewards/margins": 0.4805185794830322, "rewards/rejected": 2.4361631870269775, "step": 1574 }, { "epoch": 0.35, "learning_rate": 9.464432997686664e-06, "logits/chosen": -1.6548340320587158, "logits/rejected": -1.637711763381958, "logps/chosen": -78.05726623535156, "logps/rejected": -80.77285766601562, "loss": 1.4587, "rewards/accuracies": 1.0, "rewards/chosen": 5.355825901031494, "rewards/margins": 0.8544549942016602, "rewards/rejected": 4.501370906829834, "step": 1575 }, { "epoch": 0.35, "learning_rate": 9.463625658513073e-06, "logits/chosen": -0.9539737105369568, "logits/rejected": -0.9600383639335632, "logps/chosen": -108.57056427001953, "logps/rejected": -114.48637390136719, "loss": 1.3732, "rewards/accuracies": 0.0, "rewards/chosen": 5.956459999084473, "rewards/margins": -1.9160027503967285, "rewards/rejected": 7.872462749481201, "step": 1576 }, { "epoch": 0.35, "learning_rate": 9.462817745774316e-06, "logits/chosen": -1.3683472871780396, "logits/rejected": -1.3664120435714722, "logps/chosen": -72.29164123535156, "logps/rejected": -60.52323913574219, "loss": 1.165, "rewards/accuracies": 1.0, "rewards/chosen": 4.060588359832764, "rewards/margins": 0.12042784690856934, "rewards/rejected": 3.9401605129241943, "step": 1577 }, { "epoch": 0.35, "learning_rate": 9.462009259574207e-06, "logits/chosen": -1.6725236177444458, "logits/rejected": -1.6434355974197388, "logps/chosen": -47.79510498046875, "logps/rejected": -59.60205078125, "loss": 0.5239, "rewards/accuracies": 1.0, "rewards/chosen": 2.985811710357666, "rewards/margins": 0.2586686611175537, "rewards/rejected": 2.7271430492401123, "step": 1578 }, { "epoch": 0.35, "learning_rate": 9.461200200016636e-06, "logits/chosen": -1.3556272983551025, "logits/rejected": -1.3161636590957642, "logps/chosen": -38.30289840698242, "logps/rejected": -63.36731719970703, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": 2.5859029293060303, "rewards/margins": 0.011483430862426758, "rewards/rejected": 2.5744194984436035, "step": 1579 }, { "epoch": 0.35, "learning_rate": 9.460390567205562e-06, "logits/chosen": -1.8004882335662842, "logits/rejected": -1.6328452825546265, "logps/chosen": -133.48826599121094, "logps/rejected": -46.70936584472656, "loss": 0.2673, "rewards/accuracies": 1.0, "rewards/chosen": 6.924037456512451, "rewards/margins": 4.014584541320801, "rewards/rejected": 2.9094529151916504, "step": 1580 }, { "epoch": 0.35, "learning_rate": 9.459580361245024e-06, "logits/chosen": -1.4910882711410522, "logits/rejected": -1.4251956939697266, "logps/chosen": -78.12076568603516, "logps/rejected": -70.40614318847656, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 3.433208465576172, "rewards/margins": 1.196812391281128, "rewards/rejected": 2.236396074295044, "step": 1581 }, { "epoch": 0.35, "learning_rate": 9.458769582239128e-06, "logits/chosen": -1.3208736181259155, "logits/rejected": -1.269209861755371, "logps/chosen": -103.72257995605469, "logps/rejected": -48.93341827392578, "loss": 0.2737, "rewards/accuracies": 1.0, "rewards/chosen": 4.99334716796875, "rewards/margins": 2.5239951610565186, "rewards/rejected": 2.4693520069122314, "step": 1582 }, { "epoch": 0.35, "learning_rate": 9.457958230292061e-06, "logits/chosen": -1.407687783241272, "logits/rejected": -1.4152394533157349, "logps/chosen": -86.19320678710938, "logps/rejected": -53.58858108520508, "loss": 1.2649, "rewards/accuracies": 0.0, "rewards/chosen": 2.738806962966919, "rewards/margins": -0.013718128204345703, "rewards/rejected": 2.7525250911712646, "step": 1583 }, { "epoch": 0.35, "learning_rate": 9.457146305508078e-06, "logits/chosen": -1.321175456047058, "logits/rejected": -1.19580078125, "logps/chosen": -30.223613739013672, "logps/rejected": -43.79033660888672, "loss": 0.5709, "rewards/accuracies": 0.0, "rewards/chosen": 1.0131572484970093, "rewards/margins": -0.6966396570205688, "rewards/rejected": 1.7097969055175781, "step": 1584 }, { "epoch": 0.35, "learning_rate": 9.45633380799151e-06, "logits/chosen": -1.501914381980896, "logits/rejected": -1.501914381980896, "logps/chosen": -65.66998291015625, "logps/rejected": -65.66998291015625, "loss": 0.3706, "rewards/accuracies": 0.0, "rewards/chosen": 3.8953590393066406, "rewards/margins": 0.0, "rewards/rejected": 3.8953590393066406, "step": 1585 }, { "epoch": 0.35, "learning_rate": 9.455520737846757e-06, "logits/chosen": -1.129618763923645, "logits/rejected": -1.138429880142212, "logps/chosen": -87.0675048828125, "logps/rejected": -92.40597534179688, "loss": 1.141, "rewards/accuracies": 0.0, "rewards/chosen": 2.422947645187378, "rewards/margins": -2.131967306137085, "rewards/rejected": 4.554914951324463, "step": 1586 }, { "epoch": 0.35, "learning_rate": 9.454707095178304e-06, "logits/chosen": -1.6792279481887817, "logits/rejected": -1.5607718229293823, "logps/chosen": -147.89752197265625, "logps/rejected": -61.350990295410156, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": 6.127905368804932, "rewards/margins": 3.908129930496216, "rewards/rejected": 2.219775438308716, "step": 1587 }, { "epoch": 0.35, "learning_rate": 9.453892880090696e-06, "logits/chosen": -1.5368118286132812, "logits/rejected": -1.5047587156295776, "logps/chosen": -64.56390380859375, "logps/rejected": -67.90739440917969, "loss": 0.556, "rewards/accuracies": 0.0, "rewards/chosen": 2.9596755504608154, "rewards/margins": -0.34283924102783203, "rewards/rejected": 3.3025147914886475, "step": 1588 }, { "epoch": 0.35, "learning_rate": 9.45307809268856e-06, "logits/chosen": -1.190925121307373, "logits/rejected": -1.1911488771438599, "logps/chosen": -57.04813003540039, "logps/rejected": -58.775081634521484, "loss": 3.4331, "rewards/accuracies": 0.0, "rewards/chosen": 1.3841320276260376, "rewards/margins": -1.899023413658142, "rewards/rejected": 3.2831554412841797, "step": 1589 }, { "epoch": 0.35, "learning_rate": 9.452262733076594e-06, "logits/chosen": -1.0546510219573975, "logits/rejected": -1.0708667039871216, "logps/chosen": -23.878284454345703, "logps/rejected": -59.028411865234375, "loss": 0.2078, "rewards/accuracies": 1.0, "rewards/chosen": 1.5898380279541016, "rewards/margins": 0.6692951321601868, "rewards/rejected": 0.9205428957939148, "step": 1590 }, { "epoch": 0.35, "learning_rate": 9.45144680135957e-06, "logits/chosen": -1.5487797260284424, "logits/rejected": -1.4715627431869507, "logps/chosen": -93.67872619628906, "logps/rejected": -122.20881652832031, "loss": 0.1276, "rewards/accuracies": 1.0, "rewards/chosen": 4.752284526824951, "rewards/margins": 1.275517463684082, "rewards/rejected": 3.476767063140869, "step": 1591 }, { "epoch": 0.35, "learning_rate": 9.450630297642334e-06, "logits/chosen": -1.3144422769546509, "logits/rejected": -1.3804082870483398, "logps/chosen": -58.99395751953125, "logps/rejected": -144.44229125976562, "loss": 1.7594, "rewards/accuracies": 0.0, "rewards/chosen": 3.227341413497925, "rewards/margins": -2.4597465991973877, "rewards/rejected": 5.6870880126953125, "step": 1592 }, { "epoch": 0.35, "learning_rate": 9.449813222029802e-06, "logits/chosen": -1.6698123216629028, "logits/rejected": -1.4297842979431152, "logps/chosen": -94.21990966796875, "logps/rejected": -34.895179748535156, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": 6.9376115798950195, "rewards/margins": 3.8663971424102783, "rewards/rejected": 3.071214437484741, "step": 1593 }, { "epoch": 0.35, "learning_rate": 9.448995574626969e-06, "logits/chosen": -1.6390676498413086, "logits/rejected": -1.6723827123641968, "logps/chosen": -91.73702239990234, "logps/rejected": -120.98081970214844, "loss": 0.9293, "rewards/accuracies": 0.0, "rewards/chosen": 5.192507266998291, "rewards/margins": -1.5989737510681152, "rewards/rejected": 6.791481018066406, "step": 1594 }, { "epoch": 0.35, "learning_rate": 9.448177355538899e-06, "logits/chosen": -1.9326361417770386, "logits/rejected": -1.856391429901123, "logps/chosen": -72.31085968017578, "logps/rejected": -72.28273010253906, "loss": 1.0392, "rewards/accuracies": 1.0, "rewards/chosen": 1.8325470685958862, "rewards/margins": 0.22425079345703125, "rewards/rejected": 1.608296275138855, "step": 1595 }, { "epoch": 0.35, "learning_rate": 9.447358564870732e-06, "logits/chosen": -1.3240270614624023, "logits/rejected": -1.2914375066757202, "logps/chosen": -104.0347900390625, "logps/rejected": -122.59278106689453, "loss": 0.7842, "rewards/accuracies": 0.0, "rewards/chosen": 3.790339708328247, "rewards/margins": -0.9581153392791748, "rewards/rejected": 4.748455047607422, "step": 1596 }, { "epoch": 0.35, "learning_rate": 9.446539202727683e-06, "logits/chosen": -1.6092380285263062, "logits/rejected": -1.5677671432495117, "logps/chosen": -115.08222961425781, "logps/rejected": -71.79393005371094, "loss": 0.2566, "rewards/accuracies": 1.0, "rewards/chosen": 6.547085762023926, "rewards/margins": 2.9710488319396973, "rewards/rejected": 3.5760369300842285, "step": 1597 }, { "epoch": 0.35, "learning_rate": 9.445719269215032e-06, "logits/chosen": -1.440191626548767, "logits/rejected": -1.2878096103668213, "logps/chosen": -163.6593780517578, "logps/rejected": -45.253623962402344, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 6.718899726867676, "rewards/margins": 5.532203674316406, "rewards/rejected": 1.18669593334198, "step": 1598 }, { "epoch": 0.35, "learning_rate": 9.444898764438144e-06, "logits/chosen": -1.4885283708572388, "logits/rejected": -1.5699052810668945, "logps/chosen": -107.32077026367188, "logps/rejected": -94.41140747070312, "loss": 1.3633, "rewards/accuracies": 0.0, "rewards/chosen": 5.090867519378662, "rewards/margins": -0.1418018341064453, "rewards/rejected": 5.232669353485107, "step": 1599 }, { "epoch": 0.35, "learning_rate": 9.444077688502451e-06, "logits/chosen": -1.5463682413101196, "logits/rejected": -1.4702796936035156, "logps/chosen": -111.67045593261719, "logps/rejected": -53.599082946777344, "loss": 0.2588, "rewards/accuracies": 1.0, "rewards/chosen": 4.33204984664917, "rewards/margins": 1.7411525249481201, "rewards/rejected": 2.59089732170105, "step": 1600 }, { "epoch": 0.35, "learning_rate": 9.443256041513457e-06, "logits/chosen": -1.271876335144043, "logits/rejected": -1.2402735948562622, "logps/chosen": -83.248046875, "logps/rejected": -80.6487808227539, "loss": 1.852, "rewards/accuracies": 0.0, "rewards/chosen": 2.387652635574341, "rewards/margins": -1.4478569030761719, "rewards/rejected": 3.8355095386505127, "step": 1601 }, { "epoch": 0.35, "learning_rate": 9.442433823576741e-06, "logits/chosen": -1.433885097503662, "logits/rejected": -1.498376488685608, "logps/chosen": -43.29429626464844, "logps/rejected": -75.8402099609375, "loss": 1.6239, "rewards/accuracies": 0.0, "rewards/chosen": 1.6094554662704468, "rewards/margins": -2.915289878845215, "rewards/rejected": 4.524745464324951, "step": 1602 }, { "epoch": 0.35, "learning_rate": 9.441611034797961e-06, "logits/chosen": -1.5240803956985474, "logits/rejected": -1.4903148412704468, "logps/chosen": -175.75790405273438, "logps/rejected": -148.4400177001953, "loss": 1.5958, "rewards/accuracies": 1.0, "rewards/chosen": 8.194091796875, "rewards/margins": 0.2940382957458496, "rewards/rejected": 7.90005350112915, "step": 1603 }, { "epoch": 0.36, "learning_rate": 9.44078767528284e-06, "logits/chosen": -1.6015294790267944, "logits/rejected": -1.5830978155136108, "logps/chosen": -95.20165252685547, "logps/rejected": -155.06101989746094, "loss": 2.6412, "rewards/accuracies": 0.0, "rewards/chosen": 9.248435020446777, "rewards/margins": -1.3975963592529297, "rewards/rejected": 10.646031379699707, "step": 1604 }, { "epoch": 0.36, "learning_rate": 9.439963745137177e-06, "logits/chosen": -1.972532868385315, "logits/rejected": -1.996561884880066, "logps/chosen": -54.29821014404297, "logps/rejected": -63.186073303222656, "loss": 1.8116, "rewards/accuracies": 0.0, "rewards/chosen": 2.8507516384124756, "rewards/margins": -2.893824815750122, "rewards/rejected": 5.744576454162598, "step": 1605 }, { "epoch": 0.36, "learning_rate": 9.439139244466847e-06, "logits/chosen": -1.6909950971603394, "logits/rejected": -1.579647183418274, "logps/chosen": -111.70413208007812, "logps/rejected": -88.64369201660156, "loss": 0.4719, "rewards/accuracies": 1.0, "rewards/chosen": 6.888497829437256, "rewards/margins": 0.3885817527770996, "rewards/rejected": 6.499916076660156, "step": 1606 }, { "epoch": 0.36, "learning_rate": 9.438314173377796e-06, "logits/chosen": -1.6307759284973145, "logits/rejected": -1.6223019361495972, "logps/chosen": -47.184200286865234, "logps/rejected": -59.25328063964844, "loss": 0.8369, "rewards/accuracies": 0.0, "rewards/chosen": 3.4842259883880615, "rewards/margins": -1.4146907329559326, "rewards/rejected": 4.898916721343994, "step": 1607 }, { "epoch": 0.36, "learning_rate": 9.437488531976042e-06, "logits/chosen": -1.5499135255813599, "logits/rejected": -1.3372547626495361, "logps/chosen": -121.18612670898438, "logps/rejected": -50.60017395019531, "loss": 0.868, "rewards/accuracies": 1.0, "rewards/chosen": 5.072549343109131, "rewards/margins": 1.2808446884155273, "rewards/rejected": 3.7917046546936035, "step": 1608 }, { "epoch": 0.36, "learning_rate": 9.43666232036768e-06, "logits/chosen": -1.4708846807479858, "logits/rejected": -1.3905925750732422, "logps/chosen": -59.2213020324707, "logps/rejected": -62.79195022583008, "loss": 0.5646, "rewards/accuracies": 0.0, "rewards/chosen": 1.6450130939483643, "rewards/margins": -0.6485335826873779, "rewards/rejected": 2.293546676635742, "step": 1609 }, { "epoch": 0.36, "learning_rate": 9.435835538658873e-06, "logits/chosen": -1.3118174076080322, "logits/rejected": -1.3401150703430176, "logps/chosen": -40.48417663574219, "logps/rejected": -125.18262481689453, "loss": 1.0563, "rewards/accuracies": 0.0, "rewards/chosen": 4.713367462158203, "rewards/margins": -1.9818129539489746, "rewards/rejected": 6.695180416107178, "step": 1610 }, { "epoch": 0.36, "learning_rate": 9.435008186955866e-06, "logits/chosen": -1.5806149244308472, "logits/rejected": -1.5129525661468506, "logps/chosen": -84.66687774658203, "logps/rejected": -62.365013122558594, "loss": 0.9526, "rewards/accuracies": 0.0, "rewards/chosen": 2.8809471130371094, "rewards/margins": -0.2445068359375, "rewards/rejected": 3.1254539489746094, "step": 1611 }, { "epoch": 0.36, "learning_rate": 9.434180265364965e-06, "logits/chosen": -1.398557424545288, "logits/rejected": -1.2767466306686401, "logps/chosen": -37.48012161254883, "logps/rejected": -70.3405532836914, "loss": 1.5628, "rewards/accuracies": 0.0, "rewards/chosen": 2.2482259273529053, "rewards/margins": -2.4643895626068115, "rewards/rejected": 4.712615489959717, "step": 1612 }, { "epoch": 0.36, "learning_rate": 9.43335177399256e-06, "logits/chosen": -1.3992294073104858, "logits/rejected": -1.3013108968734741, "logps/chosen": -81.26905822753906, "logps/rejected": -36.620811462402344, "loss": 0.2488, "rewards/accuracies": 1.0, "rewards/chosen": 3.895688772201538, "rewards/margins": 0.790985107421875, "rewards/rejected": 3.104703664779663, "step": 1613 }, { "epoch": 0.36, "learning_rate": 9.432522712945111e-06, "logits/chosen": -1.3033584356307983, "logits/rejected": -1.3033584356307983, "logps/chosen": -42.98396682739258, "logps/rejected": -42.98396682739258, "loss": 0.3619, "rewards/accuracies": 0.0, "rewards/chosen": 2.725767135620117, "rewards/margins": 0.0, "rewards/rejected": 2.725767135620117, "step": 1614 }, { "epoch": 0.36, "learning_rate": 9.43169308232915e-06, "logits/chosen": -1.3781355619430542, "logits/rejected": -1.4147419929504395, "logps/chosen": -117.8520736694336, "logps/rejected": -111.8578872680664, "loss": 0.9582, "rewards/accuracies": 0.0, "rewards/chosen": 4.292551517486572, "rewards/margins": -1.7547883987426758, "rewards/rejected": 6.047339916229248, "step": 1615 }, { "epoch": 0.36, "learning_rate": 9.430862882251279e-06, "logits/chosen": -1.512007474899292, "logits/rejected": -1.6244157552719116, "logps/chosen": -70.55860137939453, "logps/rejected": -68.66659545898438, "loss": 2.529, "rewards/accuracies": 0.0, "rewards/chosen": 1.8004356622695923, "rewards/margins": -4.671724796295166, "rewards/rejected": 6.472160339355469, "step": 1616 }, { "epoch": 0.36, "learning_rate": 9.430032112818182e-06, "logits/chosen": -1.4421311616897583, "logits/rejected": -1.3276498317718506, "logps/chosen": -112.88006591796875, "logps/rejected": -49.921260833740234, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": 4.645169258117676, "rewards/margins": 2.034275531768799, "rewards/rejected": 2.610893726348877, "step": 1617 }, { "epoch": 0.36, "learning_rate": 9.429200774136603e-06, "logits/chosen": -1.3175790309906006, "logits/rejected": -1.1426881551742554, "logps/chosen": -36.21543884277344, "logps/rejected": -46.96845626831055, "loss": 0.6244, "rewards/accuracies": 1.0, "rewards/chosen": 2.9858551025390625, "rewards/margins": 2.0133659839630127, "rewards/rejected": 0.9724891781806946, "step": 1618 }, { "epoch": 0.36, "learning_rate": 9.428368866313377e-06, "logits/chosen": -1.4167430400848389, "logits/rejected": -1.066199779510498, "logps/chosen": -92.7978515625, "logps/rejected": -203.7630157470703, "loss": 4.2135, "rewards/accuracies": 0.0, "rewards/chosen": 1.2900711297988892, "rewards/margins": -8.420022010803223, "rewards/rejected": 9.71009349822998, "step": 1619 }, { "epoch": 0.36, "learning_rate": 9.427536389455394e-06, "logits/chosen": -1.3841723203659058, "logits/rejected": -1.4182567596435547, "logps/chosen": -44.440818786621094, "logps/rejected": -82.08712005615234, "loss": 0.4468, "rewards/accuracies": 1.0, "rewards/chosen": 2.8596198558807373, "rewards/margins": 1.0382628440856934, "rewards/rejected": 1.821357011795044, "step": 1620 }, { "epoch": 0.36, "learning_rate": 9.426703343669631e-06, "logits/chosen": -1.246617317199707, "logits/rejected": -1.2475115060806274, "logps/chosen": -53.11444091796875, "logps/rejected": -42.999961853027344, "loss": 0.5336, "rewards/accuracies": 1.0, "rewards/chosen": 2.7176361083984375, "rewards/margins": 0.11632990837097168, "rewards/rejected": 2.601306200027466, "step": 1621 }, { "epoch": 0.36, "learning_rate": 9.425869729063129e-06, "logits/chosen": -1.6914548873901367, "logits/rejected": -1.7061669826507568, "logps/chosen": -79.94479370117188, "logps/rejected": -83.50886535644531, "loss": 1.0146, "rewards/accuracies": 0.0, "rewards/chosen": 2.0763962268829346, "rewards/margins": -1.6564888954162598, "rewards/rejected": 3.7328851222991943, "step": 1622 }, { "epoch": 0.36, "learning_rate": 9.425035545743005e-06, "logits/chosen": -1.4387986660003662, "logits/rejected": -1.4197282791137695, "logps/chosen": -45.226173400878906, "logps/rejected": -47.0131950378418, "loss": 0.7767, "rewards/accuracies": 1.0, "rewards/chosen": 3.498544454574585, "rewards/margins": 1.8721528053283691, "rewards/rejected": 1.6263916492462158, "step": 1623 }, { "epoch": 0.36, "learning_rate": 9.424200793816451e-06, "logits/chosen": -1.7111806869506836, "logits/rejected": -1.6186896562576294, "logps/chosen": -86.98237609863281, "logps/rejected": -28.301841735839844, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 4.36146879196167, "rewards/margins": 2.4379711151123047, "rewards/rejected": 1.9234977960586548, "step": 1624 }, { "epoch": 0.36, "learning_rate": 9.423365473390734e-06, "logits/chosen": -1.4594769477844238, "logits/rejected": -1.4594769477844238, "logps/chosen": -43.904720306396484, "logps/rejected": -43.904720306396484, "loss": 0.5976, "rewards/accuracies": 0.0, "rewards/chosen": 2.5832011699676514, "rewards/margins": 0.0, "rewards/rejected": 2.5832011699676514, "step": 1625 }, { "epoch": 0.36, "learning_rate": 9.422529584573183e-06, "logits/chosen": -1.5049084424972534, "logits/rejected": -1.5016896724700928, "logps/chosen": -103.55816650390625, "logps/rejected": -47.07318115234375, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 5.881478786468506, "rewards/margins": 3.7061994075775146, "rewards/rejected": 2.175279378890991, "step": 1626 }, { "epoch": 0.36, "learning_rate": 9.421693127471214e-06, "logits/chosen": -1.2846044301986694, "logits/rejected": -1.1345127820968628, "logps/chosen": -41.869930267333984, "logps/rejected": -6.588280200958252, "loss": 0.3683, "rewards/accuracies": 1.0, "rewards/chosen": 2.335505247116089, "rewards/margins": 1.1413120031356812, "rewards/rejected": 1.1941932439804077, "step": 1627 }, { "epoch": 0.36, "learning_rate": 9.420856102192305e-06, "logits/chosen": -1.2271496057510376, "logits/rejected": -1.3548756837844849, "logps/chosen": -38.326778411865234, "logps/rejected": -33.27587890625, "loss": 1.2107, "rewards/accuracies": 0.0, "rewards/chosen": 1.1191341876983643, "rewards/margins": -1.3967304229736328, "rewards/rejected": 2.515864610671997, "step": 1628 }, { "epoch": 0.36, "learning_rate": 9.420018508844017e-06, "logits/chosen": -1.0816768407821655, "logits/rejected": -1.043563961982727, "logps/chosen": -45.86164855957031, "logps/rejected": -79.02775573730469, "loss": 2.8863, "rewards/accuracies": 0.0, "rewards/chosen": 0.9153060913085938, "rewards/margins": -3.2095727920532227, "rewards/rejected": 4.124878883361816, "step": 1629 }, { "epoch": 0.36, "learning_rate": 9.419180347533976e-06, "logits/chosen": -1.4052759408950806, "logits/rejected": -1.3919216394424438, "logps/chosen": -67.2569351196289, "logps/rejected": -71.96768951416016, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 3.0977089405059814, "rewards/margins": -1.0824949741363525, "rewards/rejected": 4.180203914642334, "step": 1630 }, { "epoch": 0.36, "learning_rate": 9.418341618369882e-06, "logits/chosen": -1.2981611490249634, "logits/rejected": -1.1844606399536133, "logps/chosen": -158.64283752441406, "logps/rejected": -125.13447570800781, "loss": 0.0832, "rewards/accuracies": 1.0, "rewards/chosen": 9.383070945739746, "rewards/margins": 1.7431492805480957, "rewards/rejected": 7.63992166519165, "step": 1631 }, { "epoch": 0.36, "learning_rate": 9.417502321459513e-06, "logits/chosen": -1.3065747022628784, "logits/rejected": -1.2274221181869507, "logps/chosen": -58.13310241699219, "logps/rejected": -12.080585479736328, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.378211259841919, "rewards/margins": 0.6146948337554932, "rewards/rejected": 0.7635164260864258, "step": 1632 }, { "epoch": 0.36, "learning_rate": 9.416662456910714e-06, "logits/chosen": -1.4494634866714478, "logits/rejected": -1.5000640153884888, "logps/chosen": -116.74427795410156, "logps/rejected": -103.10464477539062, "loss": 1.2043, "rewards/accuracies": 0.0, "rewards/chosen": 6.568115234375, "rewards/margins": -1.6344423294067383, "rewards/rejected": 8.202557563781738, "step": 1633 }, { "epoch": 0.36, "learning_rate": 9.415822024831407e-06, "logits/chosen": -1.4511579275131226, "logits/rejected": -1.4098374843597412, "logps/chosen": -92.39849090576172, "logps/rejected": -92.42723083496094, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 5.298436641693115, "rewards/margins": 1.345609188079834, "rewards/rejected": 3.9528274536132812, "step": 1634 }, { "epoch": 0.36, "learning_rate": 9.414981025329585e-06, "logits/chosen": -1.3062998056411743, "logits/rejected": -1.1488224267959595, "logps/chosen": -97.58484649658203, "logps/rejected": -41.58125305175781, "loss": 0.3101, "rewards/accuracies": 1.0, "rewards/chosen": 4.875363826751709, "rewards/margins": 2.3912999629974365, "rewards/rejected": 2.4840638637542725, "step": 1635 }, { "epoch": 0.36, "learning_rate": 9.414139458513316e-06, "logits/chosen": -1.2767295837402344, "logits/rejected": -1.2378244400024414, "logps/chosen": -69.78543853759766, "logps/rejected": -37.21136474609375, "loss": 1.0646, "rewards/accuracies": 0.0, "rewards/chosen": 1.8813339471817017, "rewards/margins": -1.5048409700393677, "rewards/rejected": 3.3861749172210693, "step": 1636 }, { "epoch": 0.36, "learning_rate": 9.413297324490736e-06, "logits/chosen": -1.7624080181121826, "logits/rejected": -1.7081151008605957, "logps/chosen": -39.66276550292969, "logps/rejected": -78.4493408203125, "loss": 0.7193, "rewards/accuracies": 0.0, "rewards/chosen": 1.314369559288025, "rewards/margins": -0.9314054250717163, "rewards/rejected": 2.245774984359741, "step": 1637 }, { "epoch": 0.36, "learning_rate": 9.41245462337006e-06, "logits/chosen": -1.5086665153503418, "logits/rejected": -1.490947961807251, "logps/chosen": -97.61021423339844, "logps/rejected": -82.28643798828125, "loss": 1.6139, "rewards/accuracies": 1.0, "rewards/chosen": 5.632388591766357, "rewards/margins": 1.0431671142578125, "rewards/rejected": 4.589221477508545, "step": 1638 }, { "epoch": 0.36, "learning_rate": 9.41161135525957e-06, "logits/chosen": -1.4877569675445557, "logits/rejected": -1.4905390739440918, "logps/chosen": -121.90408325195312, "logps/rejected": -123.8403091430664, "loss": 1.8857, "rewards/accuracies": 1.0, "rewards/chosen": 6.655097961425781, "rewards/margins": 0.15700912475585938, "rewards/rejected": 6.498088836669922, "step": 1639 }, { "epoch": 0.36, "learning_rate": 9.410767520267629e-06, "logits/chosen": -1.2309396266937256, "logits/rejected": -1.1260685920715332, "logps/chosen": -39.56924057006836, "logps/rejected": -19.22447395324707, "loss": 0.1804, "rewards/accuracies": 1.0, "rewards/chosen": 1.8284481763839722, "rewards/margins": 0.8387722373008728, "rewards/rejected": 0.9896759390830994, "step": 1640 }, { "epoch": 0.36, "learning_rate": 9.409923118502665e-06, "logits/chosen": -1.5497008562088013, "logits/rejected": -1.3645402193069458, "logps/chosen": -123.7086181640625, "logps/rejected": -74.30952453613281, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 7.030247688293457, "rewards/margins": 2.525259494781494, "rewards/rejected": 4.504988193511963, "step": 1641 }, { "epoch": 0.36, "learning_rate": 9.40907815007318e-06, "logits/chosen": -1.6622751951217651, "logits/rejected": -1.5197932720184326, "logps/chosen": -136.03994750976562, "logps/rejected": -71.58126831054688, "loss": 2.0503, "rewards/accuracies": 1.0, "rewards/chosen": 7.188244819641113, "rewards/margins": 3.177964687347412, "rewards/rejected": 4.010280132293701, "step": 1642 }, { "epoch": 0.36, "learning_rate": 9.408232615087752e-06, "logits/chosen": -1.652606725692749, "logits/rejected": -1.5973877906799316, "logps/chosen": -70.9180679321289, "logps/rejected": -63.617977142333984, "loss": 1.835, "rewards/accuracies": 0.0, "rewards/chosen": 1.134394884109497, "rewards/margins": -2.379481077194214, "rewards/rejected": 3.513875961303711, "step": 1643 }, { "epoch": 0.36, "learning_rate": 9.40738651365503e-06, "logits/chosen": -1.015020489692688, "logits/rejected": -1.015020489692688, "logps/chosen": -22.1294002532959, "logps/rejected": -22.1294002532959, "loss": 0.466, "rewards/accuracies": 0.0, "rewards/chosen": 1.8465906381607056, "rewards/margins": 0.0, "rewards/rejected": 1.8465906381607056, "step": 1644 }, { "epoch": 0.36, "learning_rate": 9.406539845883736e-06, "logits/chosen": -1.5244369506835938, "logits/rejected": -1.4779243469238281, "logps/chosen": -37.72024154663086, "logps/rejected": -57.88620376586914, "loss": 0.8726, "rewards/accuracies": 0.0, "rewards/chosen": 2.6711666584014893, "rewards/margins": -1.54640793800354, "rewards/rejected": 4.217574596405029, "step": 1645 }, { "epoch": 0.36, "learning_rate": 9.405692611882666e-06, "logits/chosen": -1.0061073303222656, "logits/rejected": -1.0849189758300781, "logps/chosen": -22.695934295654297, "logps/rejected": -78.89356994628906, "loss": 2.0309, "rewards/accuracies": 0.0, "rewards/chosen": 1.1872256994247437, "rewards/margins": -3.9794626235961914, "rewards/rejected": 5.166688442230225, "step": 1646 }, { "epoch": 0.36, "learning_rate": 9.404844811760685e-06, "logits/chosen": -1.4839030504226685, "logits/rejected": -1.4384812116622925, "logps/chosen": -89.07917785644531, "logps/rejected": -75.98829650878906, "loss": 0.3973, "rewards/accuracies": 0.0, "rewards/chosen": 4.740268230438232, "rewards/margins": -0.15088272094726562, "rewards/rejected": 4.891150951385498, "step": 1647 }, { "epoch": 0.36, "learning_rate": 9.403996445626735e-06, "logits/chosen": -1.288264513015747, "logits/rejected": -1.3400272130966187, "logps/chosen": -67.43830871582031, "logps/rejected": -110.28431701660156, "loss": 0.6431, "rewards/accuracies": 0.0, "rewards/chosen": 2.4212372303009033, "rewards/margins": -0.9558913707733154, "rewards/rejected": 3.3771286010742188, "step": 1648 }, { "epoch": 0.36, "learning_rate": 9.403147513589829e-06, "logits/chosen": -1.5876315832138062, "logits/rejected": -1.4512325525283813, "logps/chosen": -90.9709243774414, "logps/rejected": -85.26353454589844, "loss": 0.4797, "rewards/accuracies": 0.0, "rewards/chosen": 6.269191741943359, "rewards/margins": -0.256716251373291, "rewards/rejected": 6.52590799331665, "step": 1649 }, { "epoch": 0.37, "learning_rate": 9.402298015759052e-06, "logits/chosen": -1.1912965774536133, "logits/rejected": -1.118979573249817, "logps/chosen": -95.31568145751953, "logps/rejected": -71.92991638183594, "loss": 0.5722, "rewards/accuracies": 1.0, "rewards/chosen": 5.1035895347595215, "rewards/margins": 0.37772274017333984, "rewards/rejected": 4.725866794586182, "step": 1650 }, { "epoch": 0.37, "learning_rate": 9.401447952243563e-06, "logits/chosen": -1.2146711349487305, "logits/rejected": -1.404495120048523, "logps/chosen": -79.59697723388672, "logps/rejected": -48.540122985839844, "loss": 1.4306, "rewards/accuracies": 0.0, "rewards/chosen": 3.0358848571777344, "rewards/margins": -0.30971693992614746, "rewards/rejected": 3.345601797103882, "step": 1651 }, { "epoch": 0.37, "learning_rate": 9.400597323152591e-06, "logits/chosen": -1.2197991609573364, "logits/rejected": -1.0370811223983765, "logps/chosen": -46.90243148803711, "logps/rejected": -20.720712661743164, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": 1.2644115686416626, "rewards/margins": 0.4646947979927063, "rewards/rejected": 0.7997167706489563, "step": 1652 }, { "epoch": 0.37, "learning_rate": 9.399746128595444e-06, "logits/chosen": -1.28821861743927, "logits/rejected": -1.34111750125885, "logps/chosen": -119.42645263671875, "logps/rejected": -103.76297760009766, "loss": 1.0969, "rewards/accuracies": 0.0, "rewards/chosen": 4.824575901031494, "rewards/margins": -1.900357723236084, "rewards/rejected": 6.724933624267578, "step": 1653 }, { "epoch": 0.37, "learning_rate": 9.398894368681496e-06, "logits/chosen": -1.5032469034194946, "logits/rejected": -1.409180998802185, "logps/chosen": -133.98968505859375, "logps/rejected": -77.21994018554688, "loss": 0.7397, "rewards/accuracies": 1.0, "rewards/chosen": 6.186242580413818, "rewards/margins": 2.034891128540039, "rewards/rejected": 4.151351451873779, "step": 1654 }, { "epoch": 0.37, "learning_rate": 9.398042043520197e-06, "logits/chosen": -1.4160072803497314, "logits/rejected": -1.3522814512252808, "logps/chosen": -55.517974853515625, "logps/rejected": -33.85663604736328, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 2.941408634185791, "rewards/margins": 0.8100030422210693, "rewards/rejected": 2.1314055919647217, "step": 1655 }, { "epoch": 0.37, "learning_rate": 9.397189153221067e-06, "logits/chosen": -1.2762210369110107, "logits/rejected": -1.0442026853561401, "logps/chosen": -173.63233947753906, "logps/rejected": -152.34962463378906, "loss": 2.6416, "rewards/accuracies": 0.0, "rewards/chosen": 5.490852355957031, "rewards/margins": -2.626986503601074, "rewards/rejected": 8.117838859558105, "step": 1656 }, { "epoch": 0.37, "learning_rate": 9.396335697893702e-06, "logits/chosen": -1.2028604745864868, "logits/rejected": -0.977171778678894, "logps/chosen": -114.29531860351562, "logps/rejected": -16.998937606811523, "loss": 0.3249, "rewards/accuracies": 1.0, "rewards/chosen": 4.69923734664917, "rewards/margins": 4.489992141723633, "rewards/rejected": 0.20924530923366547, "step": 1657 }, { "epoch": 0.37, "learning_rate": 9.395481677647767e-06, "logits/chosen": -1.4738433361053467, "logits/rejected": -1.4463157653808594, "logps/chosen": -76.98490905761719, "logps/rejected": -110.88810729980469, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 6.569118022918701, "rewards/margins": 2.881866693496704, "rewards/rejected": 3.687251329421997, "step": 1658 }, { "epoch": 0.37, "learning_rate": 9.394627092593002e-06, "logits/chosen": -1.3653347492218018, "logits/rejected": -1.286921501159668, "logps/chosen": -107.79936218261719, "logps/rejected": -12.964882850646973, "loss": 0.1527, "rewards/accuracies": 1.0, "rewards/chosen": 4.795234680175781, "rewards/margins": 3.6570425033569336, "rewards/rejected": 1.138192057609558, "step": 1659 }, { "epoch": 0.37, "learning_rate": 9.393771942839223e-06, "logits/chosen": -1.415279507637024, "logits/rejected": -1.3496469259262085, "logps/chosen": -122.3836669921875, "logps/rejected": -98.11827850341797, "loss": 1.6462, "rewards/accuracies": 0.0, "rewards/chosen": 4.81982421875, "rewards/margins": -3.237593650817871, "rewards/rejected": 8.057417869567871, "step": 1660 }, { "epoch": 0.37, "learning_rate": 9.392916228496309e-06, "logits/chosen": -1.5883867740631104, "logits/rejected": -1.5518625974655151, "logps/chosen": -110.64129638671875, "logps/rejected": -89.4842758178711, "loss": 0.7853, "rewards/accuracies": 0.0, "rewards/chosen": 3.4304046630859375, "rewards/margins": -1.166959285736084, "rewards/rejected": 4.5973639488220215, "step": 1661 }, { "epoch": 0.37, "learning_rate": 9.392059949674222e-06, "logits/chosen": -1.2558348178863525, "logits/rejected": -1.2235791683197021, "logps/chosen": -61.3021354675293, "logps/rejected": -72.38178253173828, "loss": 0.8169, "rewards/accuracies": 0.0, "rewards/chosen": 2.0652272701263428, "rewards/margins": -1.4148967266082764, "rewards/rejected": 3.480123996734619, "step": 1662 }, { "epoch": 0.37, "learning_rate": 9.39120310648299e-06, "logits/chosen": -1.5923488140106201, "logits/rejected": -1.685053825378418, "logps/chosen": -67.10240173339844, "logps/rejected": -93.77438354492188, "loss": 0.6988, "rewards/accuracies": 0.0, "rewards/chosen": 5.437979221343994, "rewards/margins": -1.1009535789489746, "rewards/rejected": 6.538932800292969, "step": 1663 }, { "epoch": 0.37, "learning_rate": 9.390345699032712e-06, "logits/chosen": -1.5809597969055176, "logits/rejected": -1.3939224481582642, "logps/chosen": -115.17518615722656, "logps/rejected": -23.757654190063477, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 9.10026741027832, "rewards/margins": 6.9122209548950195, "rewards/rejected": 2.1880462169647217, "step": 1664 }, { "epoch": 0.37, "learning_rate": 9.389487727433569e-06, "logits/chosen": -1.7495503425598145, "logits/rejected": -1.7553707361221313, "logps/chosen": -145.1271209716797, "logps/rejected": -91.945556640625, "loss": 1.7042, "rewards/accuracies": 1.0, "rewards/chosen": 7.772390842437744, "rewards/margins": 2.7184462547302246, "rewards/rejected": 5.0539445877075195, "step": 1665 }, { "epoch": 0.37, "learning_rate": 9.388629191795804e-06, "logits/chosen": -1.354426622390747, "logits/rejected": -1.2969202995300293, "logps/chosen": -142.755859375, "logps/rejected": -80.05490112304688, "loss": 1.6818, "rewards/accuracies": 0.0, "rewards/chosen": 5.992575168609619, "rewards/margins": -2.2565064430236816, "rewards/rejected": 8.2490816116333, "step": 1666 }, { "epoch": 0.37, "learning_rate": 9.387770092229736e-06, "logits/chosen": -1.5628290176391602, "logits/rejected": -1.4600176811218262, "logps/chosen": -107.88670349121094, "logps/rejected": -90.01335144042969, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 3.8584702014923096, "rewards/margins": 1.4477143287658691, "rewards/rejected": 2.4107558727264404, "step": 1667 }, { "epoch": 0.37, "learning_rate": 9.386910428845762e-06, "logits/chosen": -1.498231291770935, "logits/rejected": -1.4513585567474365, "logps/chosen": -34.69242477416992, "logps/rejected": -62.62677764892578, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 2.885754346847534, "rewards/margins": 0.23301315307617188, "rewards/rejected": 2.6527411937713623, "step": 1668 }, { "epoch": 0.37, "learning_rate": 9.386050201754342e-06, "logits/chosen": -1.4403693675994873, "logits/rejected": -1.3168792724609375, "logps/chosen": -95.77272033691406, "logps/rejected": -58.4859619140625, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 6.566712856292725, "rewards/margins": 2.4899234771728516, "rewards/rejected": 4.076789379119873, "step": 1669 }, { "epoch": 0.37, "learning_rate": 9.385189411066014e-06, "logits/chosen": -1.3021986484527588, "logits/rejected": -1.186257004737854, "logps/chosen": -107.45835876464844, "logps/rejected": -74.74424743652344, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 5.314897060394287, "rewards/margins": 1.4689130783081055, "rewards/rejected": 3.8459839820861816, "step": 1670 }, { "epoch": 0.37, "learning_rate": 9.384328056891389e-06, "logits/chosen": -1.568042516708374, "logits/rejected": -1.6067272424697876, "logps/chosen": -48.858795166015625, "logps/rejected": -89.17811584472656, "loss": 2.417, "rewards/accuracies": 0.0, "rewards/chosen": 3.6010568141937256, "rewards/margins": -3.9468252658843994, "rewards/rejected": 7.547882080078125, "step": 1671 }, { "epoch": 0.37, "learning_rate": 9.38346613934115e-06, "logits/chosen": -1.3191543817520142, "logits/rejected": -1.1679205894470215, "logps/chosen": -57.0707893371582, "logps/rejected": -27.081851959228516, "loss": 0.9016, "rewards/accuracies": 1.0, "rewards/chosen": 2.4908459186553955, "rewards/margins": 2.097374677658081, "rewards/rejected": 0.3934711515903473, "step": 1672 }, { "epoch": 0.37, "learning_rate": 9.382603658526048e-06, "logits/chosen": -1.4223402738571167, "logits/rejected": -1.306073784828186, "logps/chosen": -89.00215148925781, "logps/rejected": -50.246551513671875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 5.612770080566406, "rewards/margins": 3.653597831726074, "rewards/rejected": 1.9591721296310425, "step": 1673 }, { "epoch": 0.37, "learning_rate": 9.381740614556911e-06, "logits/chosen": -1.2160149812698364, "logits/rejected": -1.0584484338760376, "logps/chosen": -130.62335205078125, "logps/rejected": -34.702972412109375, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 7.386152744293213, "rewards/margins": 7.358439922332764, "rewards/rejected": 0.027712631970643997, "step": 1674 }, { "epoch": 0.37, "learning_rate": 9.38087700754464e-06, "logits/chosen": -1.5207120180130005, "logits/rejected": -1.2990113496780396, "logps/chosen": -91.52518463134766, "logps/rejected": -42.310733795166016, "loss": 0.1951, "rewards/accuracies": 1.0, "rewards/chosen": 3.9485023021698, "rewards/margins": 0.7413516044616699, "rewards/rejected": 3.20715069770813, "step": 1675 }, { "epoch": 0.37, "learning_rate": 9.380012837600205e-06, "logits/chosen": -1.5733951330184937, "logits/rejected": -1.5733951330184937, "logps/chosen": -56.812862396240234, "logps/rejected": -56.812862396240234, "loss": 0.3484, "rewards/accuracies": 0.0, "rewards/chosen": 3.0600452423095703, "rewards/margins": 0.0, "rewards/rejected": 3.0600452423095703, "step": 1676 }, { "epoch": 0.37, "learning_rate": 9.379148104834648e-06, "logits/chosen": -1.2499432563781738, "logits/rejected": -1.2213680744171143, "logps/chosen": -34.98754119873047, "logps/rejected": -44.78888702392578, "loss": 1.2848, "rewards/accuracies": 0.0, "rewards/chosen": 1.5454643964767456, "rewards/margins": -0.5969406366348267, "rewards/rejected": 2.1424050331115723, "step": 1677 }, { "epoch": 0.37, "learning_rate": 9.378282809359087e-06, "logits/chosen": -1.7947863340377808, "logits/rejected": -1.7207947969436646, "logps/chosen": -130.65455627441406, "logps/rejected": -24.26419448852539, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 7.754681587219238, "rewards/margins": 5.865476131439209, "rewards/rejected": 1.8892055749893188, "step": 1678 }, { "epoch": 0.37, "learning_rate": 9.377416951284712e-06, "logits/chosen": -1.326961874961853, "logits/rejected": -1.2351627349853516, "logps/chosen": -133.0974884033203, "logps/rejected": -83.88262176513672, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 5.939949035644531, "rewards/margins": 3.9028220176696777, "rewards/rejected": 2.0371270179748535, "step": 1679 }, { "epoch": 0.37, "learning_rate": 9.376550530722778e-06, "logits/chosen": -1.4782509803771973, "logits/rejected": -1.3425469398498535, "logps/chosen": -109.45643615722656, "logps/rejected": -86.79426574707031, "loss": 0.1386, "rewards/accuracies": 1.0, "rewards/chosen": 5.5704240798950195, "rewards/margins": 1.1930556297302246, "rewards/rejected": 4.377368450164795, "step": 1680 }, { "epoch": 0.37, "learning_rate": 9.375683547784626e-06, "logits/chosen": -1.5682963132858276, "logits/rejected": -1.4699714183807373, "logps/chosen": -96.71104431152344, "logps/rejected": -63.03419876098633, "loss": 0.2532, "rewards/accuracies": 1.0, "rewards/chosen": 8.472502708435059, "rewards/margins": 5.045332908630371, "rewards/rejected": 3.4271695613861084, "step": 1681 }, { "epoch": 0.37, "learning_rate": 9.374816002581654e-06, "logits/chosen": -1.37004554271698, "logits/rejected": -1.1552026271820068, "logps/chosen": -124.27513122558594, "logps/rejected": -56.00180435180664, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 7.281791687011719, "rewards/margins": 6.990130424499512, "rewards/rejected": 0.2916610836982727, "step": 1682 }, { "epoch": 0.37, "learning_rate": 9.373947895225345e-06, "logits/chosen": -1.5746432542800903, "logits/rejected": -1.583766222000122, "logps/chosen": -72.10795593261719, "logps/rejected": -95.07561492919922, "loss": 1.26, "rewards/accuracies": 0.0, "rewards/chosen": 2.5885987281799316, "rewards/margins": -2.3988900184631348, "rewards/rejected": 4.987488746643066, "step": 1683 }, { "epoch": 0.37, "learning_rate": 9.373079225827243e-06, "logits/chosen": -1.216171145439148, "logits/rejected": -1.245152235031128, "logps/chosen": -66.62014770507812, "logps/rejected": -78.1898422241211, "loss": 2.1587, "rewards/accuracies": 0.0, "rewards/chosen": 1.5024139881134033, "rewards/margins": -3.253488302230835, "rewards/rejected": 4.755902290344238, "step": 1684 }, { "epoch": 0.37, "learning_rate": 9.372209994498976e-06, "logits/chosen": -1.7626057863235474, "logits/rejected": -1.7213488817214966, "logps/chosen": -134.88766479492188, "logps/rejected": -42.77952575683594, "loss": 1.6862, "rewards/accuracies": 1.0, "rewards/chosen": 6.6168365478515625, "rewards/margins": 4.496038436889648, "rewards/rejected": 2.120797872543335, "step": 1685 }, { "epoch": 0.37, "learning_rate": 9.371340201352234e-06, "logits/chosen": -1.1346372365951538, "logits/rejected": -1.1010960340499878, "logps/chosen": -39.91126251220703, "logps/rejected": -37.15758514404297, "loss": 0.185, "rewards/accuracies": 1.0, "rewards/chosen": 3.196551561355591, "rewards/margins": 1.1759076118469238, "rewards/rejected": 2.020643949508667, "step": 1686 }, { "epoch": 0.37, "learning_rate": 9.370469846498784e-06, "logits/chosen": -1.4291406869888306, "logits/rejected": -1.3295315504074097, "logps/chosen": -163.78431701660156, "logps/rejected": -61.774131774902344, "loss": 0.2737, "rewards/accuracies": 1.0, "rewards/chosen": 5.522619724273682, "rewards/margins": 0.713015079498291, "rewards/rejected": 4.809604644775391, "step": 1687 }, { "epoch": 0.37, "learning_rate": 9.369598930050466e-06, "logits/chosen": -1.3562787771224976, "logits/rejected": -1.3312386274337769, "logps/chosen": -66.04214477539062, "logps/rejected": -56.88845443725586, "loss": 0.4917, "rewards/accuracies": 1.0, "rewards/chosen": 2.3711745738983154, "rewards/margins": 0.08099627494812012, "rewards/rejected": 2.2901782989501953, "step": 1688 }, { "epoch": 0.37, "learning_rate": 9.368727452119188e-06, "logits/chosen": -1.2969541549682617, "logits/rejected": -1.247090220451355, "logps/chosen": -81.76268768310547, "logps/rejected": -118.89820861816406, "loss": 0.199, "rewards/accuracies": 1.0, "rewards/chosen": 4.633699893951416, "rewards/margins": 0.779747724533081, "rewards/rejected": 3.853952169418335, "step": 1689 }, { "epoch": 0.37, "learning_rate": 9.367855412816935e-06, "logits/chosen": -1.5742518901824951, "logits/rejected": -1.5494953393936157, "logps/chosen": -105.42390441894531, "logps/rejected": -59.80836486816406, "loss": 1.8499, "rewards/accuracies": 0.0, "rewards/chosen": 3.2829177379608154, "rewards/margins": -1.276672601699829, "rewards/rejected": 4.5595903396606445, "step": 1690 }, { "epoch": 0.37, "learning_rate": 9.366982812255764e-06, "logits/chosen": -1.4850910902023315, "logits/rejected": -1.4138681888580322, "logps/chosen": -99.17483520507812, "logps/rejected": -110.88066864013672, "loss": 0.1475, "rewards/accuracies": 1.0, "rewards/chosen": 4.35675048828125, "rewards/margins": 1.0972740650177002, "rewards/rejected": 3.25947642326355, "step": 1691 }, { "epoch": 0.37, "learning_rate": 9.366109650547798e-06, "logits/chosen": -1.4230103492736816, "logits/rejected": -1.1095824241638184, "logps/chosen": -136.4545440673828, "logps/rejected": -29.378843307495117, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": 5.440989971160889, "rewards/margins": 2.9677696228027344, "rewards/rejected": 2.4732203483581543, "step": 1692 }, { "epoch": 0.37, "learning_rate": 9.365235927805237e-06, "logits/chosen": -1.449944257736206, "logits/rejected": -1.4153110980987549, "logps/chosen": -157.8873291015625, "logps/rejected": -87.77790069580078, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 6.8285064697265625, "rewards/margins": 3.3334152698516846, "rewards/rejected": 3.495091199874878, "step": 1693 }, { "epoch": 0.37, "learning_rate": 9.364361644140353e-06, "logits/chosen": -1.559433937072754, "logits/rejected": -1.4822077751159668, "logps/chosen": -40.003150939941406, "logps/rejected": -57.23543167114258, "loss": 0.8855, "rewards/accuracies": 0.0, "rewards/chosen": 3.0424201488494873, "rewards/margins": -0.24483466148376465, "rewards/rejected": 3.287254810333252, "step": 1694 }, { "epoch": 0.38, "learning_rate": 9.36348679966549e-06, "logits/chosen": -0.9339846968650818, "logits/rejected": -0.869559645652771, "logps/chosen": -33.13779830932617, "logps/rejected": -14.00027084350586, "loss": 0.4359, "rewards/accuracies": 1.0, "rewards/chosen": 2.359166383743286, "rewards/margins": 1.8491535186767578, "rewards/rejected": 0.5100128054618835, "step": 1695 }, { "epoch": 0.38, "learning_rate": 9.362611394493063e-06, "logits/chosen": -1.4729902744293213, "logits/rejected": -1.4537793397903442, "logps/chosen": -67.34837341308594, "logps/rejected": -62.88438034057617, "loss": 0.1741, "rewards/accuracies": 1.0, "rewards/chosen": 3.393871307373047, "rewards/margins": 0.8772890567779541, "rewards/rejected": 2.5165822505950928, "step": 1696 }, { "epoch": 0.38, "learning_rate": 9.361735428735558e-06, "logits/chosen": -1.1386315822601318, "logits/rejected": -1.159078598022461, "logps/chosen": -53.688201904296875, "logps/rejected": -44.66434860229492, "loss": 1.3507, "rewards/accuracies": 1.0, "rewards/chosen": 4.251945495605469, "rewards/margins": 1.867666244506836, "rewards/rejected": 2.384279251098633, "step": 1697 }, { "epoch": 0.38, "learning_rate": 9.360858902505539e-06, "logits/chosen": -1.292945384979248, "logits/rejected": -1.2960869073867798, "logps/chosen": -65.97393798828125, "logps/rejected": -43.324424743652344, "loss": 0.8518, "rewards/accuracies": 0.0, "rewards/chosen": 1.504377007484436, "rewards/margins": -1.2437201738357544, "rewards/rejected": 2.7480971813201904, "step": 1698 }, { "epoch": 0.38, "learning_rate": 9.359981815915632e-06, "logits/chosen": -1.684252142906189, "logits/rejected": -1.672118067741394, "logps/chosen": -58.883087158203125, "logps/rejected": -81.33561706542969, "loss": 0.8672, "rewards/accuracies": 0.0, "rewards/chosen": 2.198983907699585, "rewards/margins": -0.09512925148010254, "rewards/rejected": 2.2941131591796875, "step": 1699 }, { "epoch": 0.38, "learning_rate": 9.359104169078541e-06, "logits/chosen": -1.2059029340744019, "logits/rejected": -1.2056845426559448, "logps/chosen": -35.405479431152344, "logps/rejected": -41.95836639404297, "loss": 1.8435, "rewards/accuracies": 0.0, "rewards/chosen": 2.572101593017578, "rewards/margins": -0.35923314094543457, "rewards/rejected": 2.9313347339630127, "step": 1700 }, { "epoch": 0.38, "learning_rate": 9.358225962107047e-06, "logits/chosen": -1.5309062004089355, "logits/rejected": -1.404157042503357, "logps/chosen": -100.642333984375, "logps/rejected": -48.53639221191406, "loss": 0.4724, "rewards/accuracies": 0.0, "rewards/chosen": 3.0020415782928467, "rewards/margins": -0.3074326515197754, "rewards/rejected": 3.309474229812622, "step": 1701 }, { "epoch": 0.38, "learning_rate": 9.35734719511399e-06, "logits/chosen": -1.4070452451705933, "logits/rejected": -1.4070452451705933, "logps/chosen": -53.278594970703125, "logps/rejected": -53.278594970703125, "loss": 1.6999, "rewards/accuracies": 0.0, "rewards/chosen": 2.304914951324463, "rewards/margins": 0.0, "rewards/rejected": 2.304914951324463, "step": 1702 }, { "epoch": 0.38, "learning_rate": 9.356467868212295e-06, "logits/chosen": -1.7186039686203003, "logits/rejected": -1.5923312902450562, "logps/chosen": -92.99836730957031, "logps/rejected": -41.48428726196289, "loss": 0.2908, "rewards/accuracies": 1.0, "rewards/chosen": 5.828561305999756, "rewards/margins": 2.7500269412994385, "rewards/rejected": 3.0785343647003174, "step": 1703 }, { "epoch": 0.38, "learning_rate": 9.35558798151495e-06, "logits/chosen": -1.5075488090515137, "logits/rejected": -1.4962410926818848, "logps/chosen": -138.07347106933594, "logps/rejected": -58.106201171875, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 6.987715244293213, "rewards/margins": 2.567074775695801, "rewards/rejected": 4.420640468597412, "step": 1704 }, { "epoch": 0.38, "learning_rate": 9.354707535135022e-06, "logits/chosen": -1.2797472476959229, "logits/rejected": -1.1418441534042358, "logps/chosen": -68.48908996582031, "logps/rejected": -5.234681129455566, "loss": 0.6315, "rewards/accuracies": 1.0, "rewards/chosen": 3.0343551635742188, "rewards/margins": 2.3445632457733154, "rewards/rejected": 0.6897918581962585, "step": 1705 }, { "epoch": 0.38, "learning_rate": 9.353826529185644e-06, "logits/chosen": -1.177781581878662, "logits/rejected": -1.177781581878662, "logps/chosen": -35.822959899902344, "logps/rejected": -35.822959899902344, "loss": 0.5975, "rewards/accuracies": 0.0, "rewards/chosen": 3.7398979663848877, "rewards/margins": 0.0, "rewards/rejected": 3.7398979663848877, "step": 1706 }, { "epoch": 0.38, "learning_rate": 9.352944963780024e-06, "logits/chosen": -1.2579561471939087, "logits/rejected": -1.2258729934692383, "logps/chosen": -26.371623992919922, "logps/rejected": -71.37092590332031, "loss": 0.9801, "rewards/accuracies": 0.0, "rewards/chosen": 1.523568034172058, "rewards/margins": -0.6013251543045044, "rewards/rejected": 2.1248931884765625, "step": 1707 }, { "epoch": 0.38, "learning_rate": 9.352062839031438e-06, "logits/chosen": -1.4350240230560303, "logits/rejected": -1.4473319053649902, "logps/chosen": -32.3564567565918, "logps/rejected": -53.50164794921875, "loss": 0.9831, "rewards/accuracies": 0.0, "rewards/chosen": 2.6032962799072266, "rewards/margins": -0.8019375801086426, "rewards/rejected": 3.405233860015869, "step": 1708 }, { "epoch": 0.38, "learning_rate": 9.351180155053242e-06, "logits/chosen": -1.3717505931854248, "logits/rejected": -1.341052770614624, "logps/chosen": -69.60133361816406, "logps/rejected": -67.8351058959961, "loss": 0.3751, "rewards/accuracies": 0.0, "rewards/chosen": 2.926295518875122, "rewards/margins": -0.08093714714050293, "rewards/rejected": 3.007232666015625, "step": 1709 }, { "epoch": 0.38, "learning_rate": 9.350296911958854e-06, "logits/chosen": -1.3020718097686768, "logits/rejected": -1.2443642616271973, "logps/chosen": -83.9163818359375, "logps/rejected": -80.90631866455078, "loss": 0.0735, "rewards/accuracies": 1.0, "rewards/chosen": 6.439265727996826, "rewards/margins": 1.9038829803466797, "rewards/rejected": 4.5353827476501465, "step": 1710 }, { "epoch": 0.38, "learning_rate": 9.34941310986177e-06, "logits/chosen": -1.293731689453125, "logits/rejected": -1.2526154518127441, "logps/chosen": -143.78762817382812, "logps/rejected": -101.13483428955078, "loss": 1.9662, "rewards/accuracies": 0.0, "rewards/chosen": 4.428042888641357, "rewards/margins": -3.9099888801574707, "rewards/rejected": 8.338031768798828, "step": 1711 }, { "epoch": 0.38, "learning_rate": 9.348528748875558e-06, "logits/chosen": -1.4225541353225708, "logits/rejected": -1.388695240020752, "logps/chosen": -81.115478515625, "logps/rejected": -80.62574768066406, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 4.170344829559326, "rewards/margins": 1.9256243705749512, "rewards/rejected": 2.244720458984375, "step": 1712 }, { "epoch": 0.38, "learning_rate": 9.347643829113856e-06, "logits/chosen": -1.8458540439605713, "logits/rejected": -1.8746421337127686, "logps/chosen": -44.085853576660156, "logps/rejected": -77.79678344726562, "loss": 1.7575, "rewards/accuracies": 0.0, "rewards/chosen": 2.166248321533203, "rewards/margins": -1.5174903869628906, "rewards/rejected": 3.6837387084960938, "step": 1713 }, { "epoch": 0.38, "learning_rate": 9.346758350690373e-06, "logits/chosen": -1.3861547708511353, "logits/rejected": -1.375715732574463, "logps/chosen": -95.07159423828125, "logps/rejected": -89.57086944580078, "loss": 0.2831, "rewards/accuracies": 1.0, "rewards/chosen": 4.3927764892578125, "rewards/margins": 0.8117766380310059, "rewards/rejected": 3.5809998512268066, "step": 1714 }, { "epoch": 0.38, "learning_rate": 9.34587231371889e-06, "logits/chosen": -1.816390872001648, "logits/rejected": -1.7767468690872192, "logps/chosen": -140.43359375, "logps/rejected": -25.084569931030273, "loss": 0.1207, "rewards/accuracies": 1.0, "rewards/chosen": 5.846946716308594, "rewards/margins": 4.751303672790527, "rewards/rejected": 1.0956430435180664, "step": 1715 }, { "epoch": 0.38, "learning_rate": 9.344985718313264e-06, "logits/chosen": -1.3719968795776367, "logits/rejected": -1.3329226970672607, "logps/chosen": -50.57758331298828, "logps/rejected": -74.23180389404297, "loss": 0.4924, "rewards/accuracies": 0.0, "rewards/chosen": 2.680593252182007, "rewards/margins": -0.42087841033935547, "rewards/rejected": 3.1014716625213623, "step": 1716 }, { "epoch": 0.38, "learning_rate": 9.344098564587418e-06, "logits/chosen": -1.3372395038604736, "logits/rejected": -1.3372395038604736, "logps/chosen": -70.00101470947266, "logps/rejected": -70.00101470947266, "loss": 0.5389, "rewards/accuracies": 0.0, "rewards/chosen": 3.6508431434631348, "rewards/margins": 0.0, "rewards/rejected": 3.6508431434631348, "step": 1717 }, { "epoch": 0.38, "learning_rate": 9.343210852655348e-06, "logits/chosen": -1.5569666624069214, "logits/rejected": -1.5957943201065063, "logps/chosen": -35.48351287841797, "logps/rejected": -44.64860916137695, "loss": 0.4164, "rewards/accuracies": 0.0, "rewards/chosen": 2.6394894123077393, "rewards/margins": -0.04325103759765625, "rewards/rejected": 2.6827404499053955, "step": 1718 }, { "epoch": 0.38, "learning_rate": 9.342322582631125e-06, "logits/chosen": -1.5552880764007568, "logits/rejected": -1.6618624925613403, "logps/chosen": -98.89850616455078, "logps/rejected": -88.59253692626953, "loss": 3.9384, "rewards/accuracies": 0.0, "rewards/chosen": 2.276512861251831, "rewards/margins": -7.7752227783203125, "rewards/rejected": 10.051735877990723, "step": 1719 }, { "epoch": 0.38, "learning_rate": 9.341433754628888e-06, "logits/chosen": -1.3393276929855347, "logits/rejected": -1.3393276929855347, "logps/chosen": -51.642452239990234, "logps/rejected": -51.642452239990234, "loss": 0.3868, "rewards/accuracies": 0.0, "rewards/chosen": 2.4161877632141113, "rewards/margins": 0.0, "rewards/rejected": 2.4161877632141113, "step": 1720 }, { "epoch": 0.38, "learning_rate": 9.340544368762851e-06, "logits/chosen": -1.008085012435913, "logits/rejected": -0.9937337636947632, "logps/chosen": -13.140115737915039, "logps/rejected": -6.712963104248047, "loss": 0.1973, "rewards/accuracies": 1.0, "rewards/chosen": 1.1861495971679688, "rewards/margins": 0.7942125201225281, "rewards/rejected": 0.3919370770454407, "step": 1721 }, { "epoch": 0.38, "learning_rate": 9.339654425147297e-06, "logits/chosen": -1.5193800926208496, "logits/rejected": -1.4767805337905884, "logps/chosen": -57.62980651855469, "logps/rejected": -67.39566040039062, "loss": 0.1281, "rewards/accuracies": 1.0, "rewards/chosen": 5.1428542137146, "rewards/margins": 2.1624083518981934, "rewards/rejected": 2.9804458618164062, "step": 1722 }, { "epoch": 0.38, "learning_rate": 9.338763923896583e-06, "logits/chosen": -1.5359448194503784, "logits/rejected": -1.5225505828857422, "logps/chosen": -80.83180236816406, "logps/rejected": -85.81258392333984, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 8.223398208618164, "rewards/margins": 6.280572891235352, "rewards/rejected": 1.9428253173828125, "step": 1723 }, { "epoch": 0.38, "learning_rate": 9.337872865125133e-06, "logits/chosen": -1.564554214477539, "logits/rejected": -1.430687665939331, "logps/chosen": -102.45741271972656, "logps/rejected": -72.82596588134766, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 8.887399673461914, "rewards/margins": 5.321407318115234, "rewards/rejected": 3.5659921169281006, "step": 1724 }, { "epoch": 0.38, "learning_rate": 9.336981248947447e-06, "logits/chosen": -1.5695868730545044, "logits/rejected": -1.4848291873931885, "logps/chosen": -110.64382934570312, "logps/rejected": -119.7942886352539, "loss": 0.3439, "rewards/accuracies": 1.0, "rewards/chosen": 6.626122951507568, "rewards/margins": 0.023453235626220703, "rewards/rejected": 6.602669715881348, "step": 1725 }, { "epoch": 0.38, "learning_rate": 9.336089075478098e-06, "logits/chosen": -1.6577237844467163, "logits/rejected": -1.6006873846054077, "logps/chosen": -51.261592864990234, "logps/rejected": -62.46141052246094, "loss": 0.8894, "rewards/accuracies": 0.0, "rewards/chosen": 2.419940710067749, "rewards/margins": -0.07114219665527344, "rewards/rejected": 2.4910829067230225, "step": 1726 }, { "epoch": 0.38, "learning_rate": 9.335196344831727e-06, "logits/chosen": -1.450954794883728, "logits/rejected": -1.4513039588928223, "logps/chosen": -88.09185028076172, "logps/rejected": -54.82220458984375, "loss": 0.7819, "rewards/accuracies": 0.0, "rewards/chosen": 1.8094291687011719, "rewards/margins": -0.15939640998840332, "rewards/rejected": 1.9688255786895752, "step": 1727 }, { "epoch": 0.38, "learning_rate": 9.334303057123044e-06, "logits/chosen": -1.5930579900741577, "logits/rejected": -1.4440760612487793, "logps/chosen": -158.4056854248047, "logps/rejected": -62.58935546875, "loss": 0.3591, "rewards/accuracies": 1.0, "rewards/chosen": 5.905035495758057, "rewards/margins": 2.9393112659454346, "rewards/rejected": 2.965724229812622, "step": 1728 }, { "epoch": 0.38, "learning_rate": 9.33340921246684e-06, "logits/chosen": -1.1989372968673706, "logits/rejected": -1.1981399059295654, "logps/chosen": -58.240325927734375, "logps/rejected": -43.01417541503906, "loss": 0.3491, "rewards/accuracies": 1.0, "rewards/chosen": 3.320216417312622, "rewards/margins": 0.07108163833618164, "rewards/rejected": 3.2491347789764404, "step": 1729 }, { "epoch": 0.38, "learning_rate": 9.332514810977969e-06, "logits/chosen": -1.5198380947113037, "logits/rejected": -1.54141104221344, "logps/chosen": -45.289146423339844, "logps/rejected": -46.54576110839844, "loss": 2.4566, "rewards/accuracies": 0.0, "rewards/chosen": 2.4147164821624756, "rewards/margins": -1.1028800010681152, "rewards/rejected": 3.517596483230591, "step": 1730 }, { "epoch": 0.38, "learning_rate": 9.331619852771361e-06, "logits/chosen": -1.380911946296692, "logits/rejected": -1.4515671730041504, "logps/chosen": -62.974510192871094, "logps/rejected": -92.45480346679688, "loss": 2.1884, "rewards/accuracies": 0.0, "rewards/chosen": 2.513108015060425, "rewards/margins": -4.361947059631348, "rewards/rejected": 6.875054836273193, "step": 1731 }, { "epoch": 0.38, "learning_rate": 9.330724337962013e-06, "logits/chosen": -1.5420376062393188, "logits/rejected": -1.5207078456878662, "logps/chosen": -86.5855712890625, "logps/rejected": -56.55816650390625, "loss": 1.2597, "rewards/accuracies": 1.0, "rewards/chosen": 4.945411682128906, "rewards/margins": 2.9905295372009277, "rewards/rejected": 1.954882025718689, "step": 1732 }, { "epoch": 0.38, "learning_rate": 9.329828266665e-06, "logits/chosen": -1.5005711317062378, "logits/rejected": -1.5537736415863037, "logps/chosen": -74.52993774414062, "logps/rejected": -76.0738754272461, "loss": 1.0203, "rewards/accuracies": 0.0, "rewards/chosen": 4.023184299468994, "rewards/margins": -1.8006339073181152, "rewards/rejected": 5.823818206787109, "step": 1733 }, { "epoch": 0.38, "learning_rate": 9.328931638995461e-06, "logits/chosen": -1.94866144657135, "logits/rejected": -1.8588800430297852, "logps/chosen": -66.85922241210938, "logps/rejected": -35.213626861572266, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 3.2142493724823, "rewards/margins": 2.8638529777526855, "rewards/rejected": 0.35039636492729187, "step": 1734 }, { "epoch": 0.38, "learning_rate": 9.328034455068616e-06, "logits/chosen": -1.5370115041732788, "logits/rejected": -1.6037349700927734, "logps/chosen": -52.08501434326172, "logps/rejected": -72.32501220703125, "loss": 2.5998, "rewards/accuracies": 0.0, "rewards/chosen": 2.9247353076934814, "rewards/margins": -5.159368515014648, "rewards/rejected": 8.08410358428955, "step": 1735 }, { "epoch": 0.38, "learning_rate": 9.327136714999745e-06, "logits/chosen": -1.3929072618484497, "logits/rejected": -1.343938946723938, "logps/chosen": -62.65642166137695, "logps/rejected": -67.58174896240234, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": 4.543672561645508, "rewards/margins": 1.9312174320220947, "rewards/rejected": 2.612455129623413, "step": 1736 }, { "epoch": 0.38, "learning_rate": 9.32623841890421e-06, "logits/chosen": -1.8858305215835571, "logits/rejected": -1.8730601072311401, "logps/chosen": -83.68299865722656, "logps/rejected": -22.995149612426758, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 5.944456577301025, "rewards/margins": 3.2516367435455322, "rewards/rejected": 2.692819833755493, "step": 1737 }, { "epoch": 0.38, "learning_rate": 9.325339566897437e-06, "logits/chosen": -1.352913737297058, "logits/rejected": -1.3315078020095825, "logps/chosen": -130.59275817871094, "logps/rejected": -56.66895294189453, "loss": 0.6425, "rewards/accuracies": 0.0, "rewards/chosen": 4.033882141113281, "rewards/margins": -0.15412092208862305, "rewards/rejected": 4.188003063201904, "step": 1738 }, { "epoch": 0.38, "learning_rate": 9.324440159094927e-06, "logits/chosen": -1.3217594623565674, "logits/rejected": -1.3217594623565674, "logps/chosen": -58.21525573730469, "logps/rejected": -58.21525573730469, "loss": 0.5185, "rewards/accuracies": 0.0, "rewards/chosen": 5.5436882972717285, "rewards/margins": 0.0, "rewards/rejected": 5.5436882972717285, "step": 1739 }, { "epoch": 0.39, "learning_rate": 9.323540195612255e-06, "logits/chosen": -1.3429700136184692, "logits/rejected": -1.309773325920105, "logps/chosen": -40.08062744140625, "logps/rejected": -32.86111831665039, "loss": 2.9364, "rewards/accuracies": 0.0, "rewards/chosen": 1.69798743724823, "rewards/margins": -1.9559780359268188, "rewards/rejected": 3.653965473175049, "step": 1740 }, { "epoch": 0.39, "learning_rate": 9.322639676565059e-06, "logits/chosen": -1.241504430770874, "logits/rejected": -1.2078951597213745, "logps/chosen": -48.31986999511719, "logps/rejected": -17.6771183013916, "loss": 0.3686, "rewards/accuracies": 1.0, "rewards/chosen": 1.856188178062439, "rewards/margins": 1.0773391723632812, "rewards/rejected": 0.7788490653038025, "step": 1741 }, { "epoch": 0.39, "learning_rate": 9.321738602069057e-06, "logits/chosen": -1.4761244058609009, "logits/rejected": -1.4571632146835327, "logps/chosen": -44.69145202636719, "logps/rejected": -68.92474365234375, "loss": 1.2772, "rewards/accuracies": 0.0, "rewards/chosen": 2.7128663063049316, "rewards/margins": -2.0354738235473633, "rewards/rejected": 4.748340129852295, "step": 1742 }, { "epoch": 0.39, "learning_rate": 9.320836972240034e-06, "logits/chosen": -1.5967966318130493, "logits/rejected": -1.646048903465271, "logps/chosen": -45.36518859863281, "logps/rejected": -118.57838439941406, "loss": 0.2643, "rewards/accuracies": 1.0, "rewards/chosen": 2.1204469203948975, "rewards/margins": 0.37251901626586914, "rewards/rejected": 1.7479279041290283, "step": 1743 }, { "epoch": 0.39, "learning_rate": 9.319934787193846e-06, "logits/chosen": -1.2941639423370361, "logits/rejected": -1.2494317293167114, "logps/chosen": -73.33719635009766, "logps/rejected": -58.91375732421875, "loss": 1.8342, "rewards/accuracies": 0.0, "rewards/chosen": 2.059282064437866, "rewards/margins": -2.504992723464966, "rewards/rejected": 4.564274787902832, "step": 1744 }, { "epoch": 0.39, "learning_rate": 9.319032047046422e-06, "logits/chosen": -1.5003337860107422, "logits/rejected": -1.381629467010498, "logps/chosen": -95.90357971191406, "logps/rejected": -122.3731460571289, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 5.829753398895264, "rewards/margins": 3.806917905807495, "rewards/rejected": 2.0228354930877686, "step": 1745 }, { "epoch": 0.39, "learning_rate": 9.318128751913764e-06, "logits/chosen": -1.6205813884735107, "logits/rejected": -1.6140601634979248, "logps/chosen": -67.64204406738281, "logps/rejected": -70.49345397949219, "loss": 1.1342, "rewards/accuracies": 0.0, "rewards/chosen": 1.208319067955017, "rewards/margins": -2.158320903778076, "rewards/rejected": 3.3666398525238037, "step": 1746 }, { "epoch": 0.39, "learning_rate": 9.317224901911941e-06, "logits/chosen": -1.9291186332702637, "logits/rejected": -1.9127517938613892, "logps/chosen": -66.07887268066406, "logps/rejected": -78.67462158203125, "loss": 1.2465, "rewards/accuracies": 0.0, "rewards/chosen": 3.3214797973632812, "rewards/margins": -1.3594269752502441, "rewards/rejected": 4.680906772613525, "step": 1747 }, { "epoch": 0.39, "learning_rate": 9.316320497157097e-06, "logits/chosen": -1.4712644815444946, "logits/rejected": -1.4596647024154663, "logps/chosen": -33.384971618652344, "logps/rejected": -54.45400619506836, "loss": 0.8395, "rewards/accuracies": 0.0, "rewards/chosen": 1.1404980421066284, "rewards/margins": -1.4618126153945923, "rewards/rejected": 2.6023106575012207, "step": 1748 }, { "epoch": 0.39, "learning_rate": 9.315415537765446e-06, "logits/chosen": -1.498984456062317, "logits/rejected": -1.212235927581787, "logps/chosen": -126.74264526367188, "logps/rejected": -23.209924697875977, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 4.104498386383057, "rewards/margins": 3.5682735443115234, "rewards/rejected": 0.5362247824668884, "step": 1749 }, { "epoch": 0.39, "learning_rate": 9.314510023853272e-06, "logits/chosen": -1.7094539403915405, "logits/rejected": -1.6549514532089233, "logps/chosen": -33.94266128540039, "logps/rejected": -29.55668830871582, "loss": 1.4643, "rewards/accuracies": 0.0, "rewards/chosen": 2.073566436767578, "rewards/margins": -1.772902011871338, "rewards/rejected": 3.846468448638916, "step": 1750 }, { "epoch": 0.39, "learning_rate": 9.313603955536931e-06, "logits/chosen": -1.6622666120529175, "logits/rejected": -1.655590295791626, "logps/chosen": -45.22948455810547, "logps/rejected": -83.47274780273438, "loss": 1.1039, "rewards/accuracies": 0.0, "rewards/chosen": 2.294175863265991, "rewards/margins": -1.487790584564209, "rewards/rejected": 3.7819664478302, "step": 1751 }, { "epoch": 0.39, "learning_rate": 9.312697332932852e-06, "logits/chosen": -1.415014386177063, "logits/rejected": -1.3950467109680176, "logps/chosen": -45.3390998840332, "logps/rejected": -50.73759078979492, "loss": 0.1905, "rewards/accuracies": 1.0, "rewards/chosen": 3.1851253509521484, "rewards/margins": 0.8576576709747314, "rewards/rejected": 2.327467679977417, "step": 1752 }, { "epoch": 0.39, "learning_rate": 9.311790156157533e-06, "logits/chosen": -1.3169784545898438, "logits/rejected": -1.0271974802017212, "logps/chosen": -122.86080932617188, "logps/rejected": -38.308258056640625, "loss": 0.3732, "rewards/accuracies": 0.0, "rewards/chosen": 3.3257293701171875, "rewards/margins": -0.011270999908447266, "rewards/rejected": 3.3370003700256348, "step": 1753 }, { "epoch": 0.39, "learning_rate": 9.310882425327544e-06, "logits/chosen": -1.7115174531936646, "logits/rejected": -1.6590638160705566, "logps/chosen": -86.84862518310547, "logps/rejected": -123.42837524414062, "loss": 0.4166, "rewards/accuracies": 0.0, "rewards/chosen": 5.959407806396484, "rewards/margins": -0.259005069732666, "rewards/rejected": 6.21841287612915, "step": 1754 }, { "epoch": 0.39, "learning_rate": 9.309974140559525e-06, "logits/chosen": -1.6641578674316406, "logits/rejected": -1.4606120586395264, "logps/chosen": -48.044822692871094, "logps/rejected": -61.66693878173828, "loss": 1.6098, "rewards/accuracies": 0.0, "rewards/chosen": 2.701073408126831, "rewards/margins": -3.173635244369507, "rewards/rejected": 5.874708652496338, "step": 1755 }, { "epoch": 0.39, "learning_rate": 9.309065301970193e-06, "logits/chosen": -1.6992199420928955, "logits/rejected": -1.657706618309021, "logps/chosen": -57.90958023071289, "logps/rejected": -50.71057891845703, "loss": 0.778, "rewards/accuracies": 1.0, "rewards/chosen": 3.611114263534546, "rewards/margins": 0.028735876083374023, "rewards/rejected": 3.582378387451172, "step": 1756 }, { "epoch": 0.39, "learning_rate": 9.308155909676326e-06, "logits/chosen": -1.239009976387024, "logits/rejected": -1.1833208799362183, "logps/chosen": -33.52569580078125, "logps/rejected": -26.459148406982422, "loss": 0.1199, "rewards/accuracies": 1.0, "rewards/chosen": 3.1984620094299316, "rewards/margins": 1.4982277154922485, "rewards/rejected": 1.700234293937683, "step": 1757 }, { "epoch": 0.39, "learning_rate": 9.307245963794782e-06, "logits/chosen": -1.6051476001739502, "logits/rejected": -1.4734665155410767, "logps/chosen": -103.80553436279297, "logps/rejected": -98.14794921875, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 8.355986595153809, "rewards/margins": 6.419571399688721, "rewards/rejected": 1.9364150762557983, "step": 1758 }, { "epoch": 0.39, "learning_rate": 9.306335464442485e-06, "logits/chosen": -1.3394734859466553, "logits/rejected": -1.3394734859466553, "logps/chosen": -61.868404388427734, "logps/rejected": -61.868404388427734, "loss": 0.3538, "rewards/accuracies": 0.0, "rewards/chosen": 4.7076544761657715, "rewards/margins": 0.0, "rewards/rejected": 4.7076544761657715, "step": 1759 }, { "epoch": 0.39, "learning_rate": 9.305424411736434e-06, "logits/chosen": -1.547992467880249, "logits/rejected": -1.4599140882492065, "logps/chosen": -70.93899536132812, "logps/rejected": -52.785865783691406, "loss": 0.3326, "rewards/accuracies": 1.0, "rewards/chosen": 1.5444778203964233, "rewards/margins": 1.0023760795593262, "rewards/rejected": 0.5421016812324524, "step": 1760 }, { "epoch": 0.39, "learning_rate": 9.304512805793696e-06, "logits/chosen": -1.3181111812591553, "logits/rejected": -1.07584547996521, "logps/chosen": -57.63178253173828, "logps/rejected": -141.44482421875, "loss": 1.5126, "rewards/accuracies": 0.0, "rewards/chosen": 3.2002739906311035, "rewards/margins": -1.4649605751037598, "rewards/rejected": 4.665234565734863, "step": 1761 }, { "epoch": 0.39, "learning_rate": 9.30360064673141e-06, "logits/chosen": -1.7933108806610107, "logits/rejected": -1.785913348197937, "logps/chosen": -97.47924041748047, "logps/rejected": -59.399253845214844, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": 5.9632697105407715, "rewards/margins": 2.502664804458618, "rewards/rejected": 3.4606049060821533, "step": 1762 }, { "epoch": 0.39, "learning_rate": 9.302687934666787e-06, "logits/chosen": -1.5606106519699097, "logits/rejected": -1.5573285818099976, "logps/chosen": -160.99465942382812, "logps/rejected": -172.441650390625, "loss": 0.1058, "rewards/accuracies": 1.0, "rewards/chosen": 9.038782119750977, "rewards/margins": 1.5860567092895508, "rewards/rejected": 7.452725410461426, "step": 1763 }, { "epoch": 0.39, "learning_rate": 9.301774669717108e-06, "logits/chosen": -1.2841116189956665, "logits/rejected": -1.2511353492736816, "logps/chosen": -70.94015502929688, "logps/rejected": -236.28631591796875, "loss": 3.5837, "rewards/accuracies": 0.0, "rewards/chosen": 4.343560218811035, "rewards/margins": -5.112146377563477, "rewards/rejected": 9.455706596374512, "step": 1764 }, { "epoch": 0.39, "learning_rate": 9.300860851999723e-06, "logits/chosen": -1.575122594833374, "logits/rejected": -1.46320378780365, "logps/chosen": -76.38993835449219, "logps/rejected": -71.76358032226562, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 5.260364055633545, "rewards/margins": 3.0950496196746826, "rewards/rejected": 2.1653144359588623, "step": 1765 }, { "epoch": 0.39, "learning_rate": 9.299946481632058e-06, "logits/chosen": -1.6317720413208008, "logits/rejected": -1.6508976221084595, "logps/chosen": -45.21567153930664, "logps/rejected": -97.61192321777344, "loss": 1.2312, "rewards/accuracies": 0.0, "rewards/chosen": 6.093961715698242, "rewards/margins": -0.979454517364502, "rewards/rejected": 7.073416233062744, "step": 1766 }, { "epoch": 0.39, "learning_rate": 9.299031558731608e-06, "logits/chosen": -1.8463202714920044, "logits/rejected": -1.7777985334396362, "logps/chosen": -37.46868133544922, "logps/rejected": -16.226396560668945, "loss": 1.0019, "rewards/accuracies": 1.0, "rewards/chosen": 3.634014129638672, "rewards/margins": 2.5191640853881836, "rewards/rejected": 1.1148500442504883, "step": 1767 }, { "epoch": 0.39, "learning_rate": 9.298116083415937e-06, "logits/chosen": -1.2528703212738037, "logits/rejected": -1.2277028560638428, "logps/chosen": -62.6547737121582, "logps/rejected": -48.3031005859375, "loss": 1.0465, "rewards/accuracies": 0.0, "rewards/chosen": 1.9434833526611328, "rewards/margins": -1.926696538925171, "rewards/rejected": 3.8701798915863037, "step": 1768 }, { "epoch": 0.39, "learning_rate": 9.297200055802683e-06, "logits/chosen": -1.291256070137024, "logits/rejected": -1.3154176473617554, "logps/chosen": -81.43426513671875, "logps/rejected": -108.42884063720703, "loss": 0.6726, "rewards/accuracies": 0.0, "rewards/chosen": 3.607374668121338, "rewards/margins": -0.8504128456115723, "rewards/rejected": 4.45778751373291, "step": 1769 }, { "epoch": 0.39, "learning_rate": 9.296283476009551e-06, "logits/chosen": -1.4107489585876465, "logits/rejected": -1.3428916931152344, "logps/chosen": -71.68632507324219, "logps/rejected": -19.8802433013916, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 3.9154441356658936, "rewards/margins": 3.69940447807312, "rewards/rejected": 0.21603965759277344, "step": 1770 }, { "epoch": 0.39, "learning_rate": 9.295366344154319e-06, "logits/chosen": -1.3913226127624512, "logits/rejected": -1.267946481704712, "logps/chosen": -104.34080505371094, "logps/rejected": -35.3742561340332, "loss": 1.8371, "rewards/accuracies": 1.0, "rewards/chosen": 4.0940399169921875, "rewards/margins": 3.829017162322998, "rewards/rejected": 0.2650226652622223, "step": 1771 }, { "epoch": 0.39, "learning_rate": 9.29444866035484e-06, "logits/chosen": -1.3642946481704712, "logits/rejected": -1.3605681657791138, "logps/chosen": -88.64596557617188, "logps/rejected": -75.1116714477539, "loss": 2.92, "rewards/accuracies": 0.0, "rewards/chosen": 0.9440551996231079, "rewards/margins": -5.6745171546936035, "rewards/rejected": 6.618572235107422, "step": 1772 }, { "epoch": 0.39, "learning_rate": 9.293530424729029e-06, "logits/chosen": -1.6114412546157837, "logits/rejected": -1.5176266431808472, "logps/chosen": -43.043304443359375, "logps/rejected": -24.115901947021484, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": 1.6398247480392456, "rewards/margins": 1.6264649629592896, "rewards/rejected": 0.01335983257740736, "step": 1773 }, { "epoch": 0.39, "learning_rate": 9.292611637394881e-06, "logits/chosen": -1.2039148807525635, "logits/rejected": -1.1523032188415527, "logps/chosen": -64.28892517089844, "logps/rejected": -48.67866897583008, "loss": 0.1766, "rewards/accuracies": 1.0, "rewards/chosen": 2.9237639904022217, "rewards/margins": 0.8617982864379883, "rewards/rejected": 2.0619657039642334, "step": 1774 }, { "epoch": 0.39, "learning_rate": 9.291692298470457e-06, "logits/chosen": -1.4333879947662354, "logits/rejected": -1.3936545848846436, "logps/chosen": -97.70323181152344, "logps/rejected": -57.26881408691406, "loss": 0.1709, "rewards/accuracies": 1.0, "rewards/chosen": 4.796247959136963, "rewards/margins": 1.1985626220703125, "rewards/rejected": 3.5976853370666504, "step": 1775 }, { "epoch": 0.39, "learning_rate": 9.29077240807389e-06, "logits/chosen": -1.8737835884094238, "logits/rejected": -1.8313521146774292, "logps/chosen": -95.92005157470703, "logps/rejected": -84.70323181152344, "loss": 1.6649, "rewards/accuracies": 0.0, "rewards/chosen": 1.301129937171936, "rewards/margins": -2.212876319885254, "rewards/rejected": 3.5140061378479004, "step": 1776 }, { "epoch": 0.39, "learning_rate": 9.289851966323382e-06, "logits/chosen": -1.55984365940094, "logits/rejected": -1.4686832427978516, "logps/chosen": -85.42874908447266, "logps/rejected": -81.70036315917969, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": 4.6394219398498535, "rewards/margins": 1.457883596420288, "rewards/rejected": 3.1815383434295654, "step": 1777 }, { "epoch": 0.39, "learning_rate": 9.288930973337212e-06, "logits/chosen": -1.3949589729309082, "logits/rejected": -1.2737162113189697, "logps/chosen": -56.806427001953125, "logps/rejected": -39.01237487792969, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 2.6805801391601562, "rewards/margins": 1.753842830657959, "rewards/rejected": 0.9267372488975525, "step": 1778 }, { "epoch": 0.39, "learning_rate": 9.288009429233717e-06, "logits/chosen": -1.3654673099517822, "logits/rejected": -1.36713707447052, "logps/chosen": -56.55582046508789, "logps/rejected": -57.634681701660156, "loss": 1.0348, "rewards/accuracies": 0.0, "rewards/chosen": 3.501333236694336, "rewards/margins": -0.32389259338378906, "rewards/rejected": 3.825225830078125, "step": 1779 }, { "epoch": 0.39, "learning_rate": 9.287087334131322e-06, "logits/chosen": -1.4404035806655884, "logits/rejected": -1.3824526071548462, "logps/chosen": -187.0799560546875, "logps/rejected": -172.5933074951172, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 10.015353202819824, "rewards/margins": 1.283961296081543, "rewards/rejected": 8.731391906738281, "step": 1780 }, { "epoch": 0.39, "learning_rate": 9.28616468814851e-06, "logits/chosen": -1.5122299194335938, "logits/rejected": -1.4534856081008911, "logps/chosen": -47.46631622314453, "logps/rejected": -50.452117919921875, "loss": 0.5722, "rewards/accuracies": 0.0, "rewards/chosen": 2.9383492469787598, "rewards/margins": -0.6028938293457031, "rewards/rejected": 3.541243076324463, "step": 1781 }, { "epoch": 0.39, "learning_rate": 9.28524149140384e-06, "logits/chosen": -1.4214136600494385, "logits/rejected": -1.346156358718872, "logps/chosen": -87.98591613769531, "logps/rejected": -64.57892608642578, "loss": 0.9121, "rewards/accuracies": 1.0, "rewards/chosen": 4.222825527191162, "rewards/margins": 3.2425849437713623, "rewards/rejected": 0.9802406430244446, "step": 1782 }, { "epoch": 0.39, "learning_rate": 9.284317744015938e-06, "logits/chosen": -1.5573419332504272, "logits/rejected": -1.4890501499176025, "logps/chosen": -62.94169235229492, "logps/rejected": -50.060142517089844, "loss": 0.3767, "rewards/accuracies": 0.0, "rewards/chosen": 2.626436233520508, "rewards/margins": -0.10425686836242676, "rewards/rejected": 2.7306931018829346, "step": 1783 }, { "epoch": 0.39, "learning_rate": 9.283393446103506e-06, "logits/chosen": -1.9787266254425049, "logits/rejected": -1.87132728099823, "logps/chosen": -68.32405090332031, "logps/rejected": -53.02394485473633, "loss": 0.6986, "rewards/accuracies": 0.0, "rewards/chosen": 2.543891191482544, "rewards/margins": -1.0604825019836426, "rewards/rejected": 3.6043736934661865, "step": 1784 }, { "epoch": 0.4, "learning_rate": 9.282468597785312e-06, "logits/chosen": -1.4578429460525513, "logits/rejected": -1.4578429460525513, "logps/chosen": -91.98133850097656, "logps/rejected": -91.98133850097656, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 2.8760955333709717, "rewards/margins": 0.0, "rewards/rejected": 2.8760955333709717, "step": 1785 }, { "epoch": 0.4, "learning_rate": 9.2815431991802e-06, "logits/chosen": -1.277057409286499, "logits/rejected": -1.2710598707199097, "logps/chosen": -29.914384841918945, "logps/rejected": -9.260367393493652, "loss": 0.3935, "rewards/accuracies": 1.0, "rewards/chosen": 2.3122687339782715, "rewards/margins": 1.1697238683700562, "rewards/rejected": 1.1425448656082153, "step": 1786 }, { "epoch": 0.4, "learning_rate": 9.280617250407078e-06, "logits/chosen": -1.7555127143859863, "logits/rejected": -1.7113980054855347, "logps/chosen": -45.11138153076172, "logps/rejected": -22.750951766967773, "loss": 0.1639, "rewards/accuracies": 1.0, "rewards/chosen": 3.1219565868377686, "rewards/margins": 1.376355528831482, "rewards/rejected": 1.7456010580062866, "step": 1787 }, { "epoch": 0.4, "learning_rate": 9.27969075158493e-06, "logits/chosen": -1.7613325119018555, "logits/rejected": -1.7692021131515503, "logps/chosen": -84.57439422607422, "logps/rejected": -33.68363952636719, "loss": 0.4116, "rewards/accuracies": 0.0, "rewards/chosen": 3.00445556640625, "rewards/margins": -0.18884587287902832, "rewards/rejected": 3.1933014392852783, "step": 1788 }, { "epoch": 0.4, "learning_rate": 9.278763702832809e-06, "logits/chosen": -1.2325992584228516, "logits/rejected": -1.2397990226745605, "logps/chosen": -70.58378601074219, "logps/rejected": -43.20512390136719, "loss": 0.896, "rewards/accuracies": 0.0, "rewards/chosen": 2.8259377479553223, "rewards/margins": -1.4833288192749023, "rewards/rejected": 4.309266567230225, "step": 1789 }, { "epoch": 0.4, "learning_rate": 9.277836104269837e-06, "logits/chosen": -1.3365532159805298, "logits/rejected": -1.2578036785125732, "logps/chosen": -41.5611572265625, "logps/rejected": -37.93387222290039, "loss": 2.1994, "rewards/accuracies": 1.0, "rewards/chosen": 1.924081802368164, "rewards/margins": 1.0129951238632202, "rewards/rejected": 0.9110866785049438, "step": 1790 }, { "epoch": 0.4, "learning_rate": 9.276907956015212e-06, "logits/chosen": -1.4507148265838623, "logits/rejected": -1.477834701538086, "logps/chosen": -81.09268188476562, "logps/rejected": -39.728607177734375, "loss": 1.7552, "rewards/accuracies": 0.0, "rewards/chosen": 2.0066604614257812, "rewards/margins": -2.103328227996826, "rewards/rejected": 4.109988689422607, "step": 1791 }, { "epoch": 0.4, "learning_rate": 9.275979258188192e-06, "logits/chosen": -1.355169653892517, "logits/rejected": -1.2571781873703003, "logps/chosen": -59.12030029296875, "logps/rejected": -61.340126037597656, "loss": 1.7685, "rewards/accuracies": 0.0, "rewards/chosen": 2.3797073364257812, "rewards/margins": -1.2707335948944092, "rewards/rejected": 3.6504409313201904, "step": 1792 }, { "epoch": 0.4, "learning_rate": 9.275050010908118e-06, "logits/chosen": -1.6148163080215454, "logits/rejected": -1.5839436054229736, "logps/chosen": -60.934879302978516, "logps/rejected": -67.94281768798828, "loss": 0.3371, "rewards/accuracies": 1.0, "rewards/chosen": 3.7890331745147705, "rewards/margins": 0.04520368576049805, "rewards/rejected": 3.7438294887542725, "step": 1793 }, { "epoch": 0.4, "learning_rate": 9.274120214294395e-06, "logits/chosen": -1.1371384859085083, "logits/rejected": -1.1123220920562744, "logps/chosen": -66.7076187133789, "logps/rejected": -45.02006530761719, "loss": 0.8114, "rewards/accuracies": 0.0, "rewards/chosen": 2.6197669506073, "rewards/margins": -0.0854485034942627, "rewards/rejected": 2.7052154541015625, "step": 1794 }, { "epoch": 0.4, "learning_rate": 9.273189868466499e-06, "logits/chosen": -1.0870031118392944, "logits/rejected": -1.1584484577178955, "logps/chosen": -34.33595275878906, "logps/rejected": -83.79864501953125, "loss": 1.4718, "rewards/accuracies": 0.0, "rewards/chosen": 3.015214681625366, "rewards/margins": -2.7970564365386963, "rewards/rejected": 5.8122711181640625, "step": 1795 }, { "epoch": 0.4, "learning_rate": 9.272258973543977e-06, "logits/chosen": -1.2968838214874268, "logits/rejected": -1.2808696031570435, "logps/chosen": -55.596519470214844, "logps/rejected": -45.34252166748047, "loss": 0.5698, "rewards/accuracies": 0.0, "rewards/chosen": 2.2573158740997314, "rewards/margins": -0.6827247142791748, "rewards/rejected": 2.9400405883789062, "step": 1796 }, { "epoch": 0.4, "learning_rate": 9.271327529646447e-06, "logits/chosen": -1.824878215789795, "logits/rejected": -1.869339942932129, "logps/chosen": -83.78856658935547, "logps/rejected": -28.18172264099121, "loss": 0.5974, "rewards/accuracies": 1.0, "rewards/chosen": 3.195903778076172, "rewards/margins": 1.0380737781524658, "rewards/rejected": 2.157829999923706, "step": 1797 }, { "epoch": 0.4, "learning_rate": 9.270395536893599e-06, "logits/chosen": -1.1444014310836792, "logits/rejected": -0.9951407313346863, "logps/chosen": -73.92533874511719, "logps/rejected": -38.00094223022461, "loss": 0.3282, "rewards/accuracies": 1.0, "rewards/chosen": 2.4471375942230225, "rewards/margins": 1.2456005811691284, "rewards/rejected": 1.201537013053894, "step": 1798 }, { "epoch": 0.4, "learning_rate": 9.269462995405189e-06, "logits/chosen": -1.6263834238052368, "logits/rejected": -1.4330247640609741, "logps/chosen": -131.8964385986328, "logps/rejected": -79.33077239990234, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": 10.411870002746582, "rewards/margins": 10.832695960998535, "rewards/rejected": -0.4208259582519531, "step": 1799 }, { "epoch": 0.4, "learning_rate": 9.268529905301049e-06, "logits/chosen": -1.3388854265213013, "logits/rejected": -1.2845892906188965, "logps/chosen": -54.24066162109375, "logps/rejected": -52.80756378173828, "loss": 0.1974, "rewards/accuracies": 1.0, "rewards/chosen": 4.057366847991943, "rewards/margins": 1.2152478694915771, "rewards/rejected": 2.842118978500366, "step": 1800 }, { "epoch": 0.4, "learning_rate": 9.267596266701076e-06, "logits/chosen": -1.455287218093872, "logits/rejected": -1.455287218093872, "logps/chosen": -30.006973266601562, "logps/rejected": -30.006973266601562, "loss": 0.7467, "rewards/accuracies": 0.0, "rewards/chosen": 4.339795112609863, "rewards/margins": 0.0, "rewards/rejected": 4.339795112609863, "step": 1801 }, { "epoch": 0.4, "learning_rate": 9.266662079725241e-06, "logits/chosen": -1.5451900959014893, "logits/rejected": -1.4422080516815186, "logps/chosen": -89.22760009765625, "logps/rejected": -59.274803161621094, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 4.5939788818359375, "rewards/margins": 1.1970603466033936, "rewards/rejected": 3.396918535232544, "step": 1802 }, { "epoch": 0.4, "learning_rate": 9.265727344493587e-06, "logits/chosen": -1.22208571434021, "logits/rejected": -1.1711686849594116, "logps/chosen": -73.48944091796875, "logps/rejected": -34.51213073730469, "loss": 0.7647, "rewards/accuracies": 0.0, "rewards/chosen": 1.7339324951171875, "rewards/margins": -1.1248290538787842, "rewards/rejected": 2.8587615489959717, "step": 1803 }, { "epoch": 0.4, "learning_rate": 9.264792061126224e-06, "logits/chosen": -1.2649365663528442, "logits/rejected": -1.2587461471557617, "logps/chosen": -34.6595344543457, "logps/rejected": -67.99434661865234, "loss": 1.1534, "rewards/accuracies": 0.0, "rewards/chosen": 3.1785786151885986, "rewards/margins": -1.9017713069915771, "rewards/rejected": 5.080349922180176, "step": 1804 }, { "epoch": 0.4, "learning_rate": 9.263856229743334e-06, "logits/chosen": -1.5752830505371094, "logits/rejected": -1.5441780090332031, "logps/chosen": -81.77473449707031, "logps/rejected": -141.0889434814453, "loss": 0.9998, "rewards/accuracies": 0.0, "rewards/chosen": 5.998080730438232, "rewards/margins": -1.8460311889648438, "rewards/rejected": 7.844111919403076, "step": 1805 }, { "epoch": 0.4, "learning_rate": 9.262919850465166e-06, "logits/chosen": -1.6964187622070312, "logits/rejected": -1.675478458404541, "logps/chosen": -59.91899490356445, "logps/rejected": -59.86107635498047, "loss": 0.9492, "rewards/accuracies": 0.0, "rewards/chosen": 2.6280133724212646, "rewards/margins": -1.487973928451538, "rewards/rejected": 4.115987300872803, "step": 1806 }, { "epoch": 0.4, "learning_rate": 9.261982923412046e-06, "logits/chosen": -1.1990935802459717, "logits/rejected": -1.041049599647522, "logps/chosen": -57.470375061035156, "logps/rejected": -10.766892433166504, "loss": 0.0877, "rewards/accuracies": 1.0, "rewards/chosen": 2.6858787536621094, "rewards/margins": 1.829197883605957, "rewards/rejected": 0.8566808104515076, "step": 1807 }, { "epoch": 0.4, "learning_rate": 9.261045448704367e-06, "logits/chosen": -1.062685489654541, "logits/rejected": -1.038108229637146, "logps/chosen": -37.77836990356445, "logps/rejected": -32.34926986694336, "loss": 1.3711, "rewards/accuracies": 1.0, "rewards/chosen": 1.8820819854736328, "rewards/margins": 0.23055493831634521, "rewards/rejected": 1.6515270471572876, "step": 1808 }, { "epoch": 0.4, "learning_rate": 9.26010742646259e-06, "logits/chosen": -1.3421064615249634, "logits/rejected": -1.3411085605621338, "logps/chosen": -41.04907989501953, "logps/rejected": -27.74225616455078, "loss": 1.6968, "rewards/accuracies": 0.0, "rewards/chosen": 0.9480213522911072, "rewards/margins": -0.5148292183876038, "rewards/rejected": 1.462850570678711, "step": 1809 }, { "epoch": 0.4, "learning_rate": 9.259168856807249e-06, "logits/chosen": -1.6129236221313477, "logits/rejected": -1.504226565361023, "logps/chosen": -64.3719482421875, "logps/rejected": -84.33837890625, "loss": 0.2838, "rewards/accuracies": 1.0, "rewards/chosen": 5.582674503326416, "rewards/margins": 2.910032033920288, "rewards/rejected": 2.672642469406128, "step": 1810 }, { "epoch": 0.4, "learning_rate": 9.25822973985895e-06, "logits/chosen": -1.1688690185546875, "logits/rejected": -1.2316391468048096, "logps/chosen": -104.265625, "logps/rejected": -88.5650634765625, "loss": 2.5011, "rewards/accuracies": 0.0, "rewards/chosen": 2.4249298572540283, "rewards/margins": -4.960195541381836, "rewards/rejected": 7.385125637054443, "step": 1811 }, { "epoch": 0.4, "learning_rate": 9.257290075738365e-06, "logits/chosen": -1.7207492589950562, "logits/rejected": -1.6436333656311035, "logps/chosen": -68.10299682617188, "logps/rejected": -80.10967254638672, "loss": 1.2526, "rewards/accuracies": 1.0, "rewards/chosen": 5.785576820373535, "rewards/margins": 1.3681178092956543, "rewards/rejected": 4.417459011077881, "step": 1812 }, { "epoch": 0.4, "learning_rate": 9.25634986456624e-06, "logits/chosen": -1.7849156856536865, "logits/rejected": -1.763616919517517, "logps/chosen": -37.40155029296875, "logps/rejected": -104.84066772460938, "loss": 3.2957, "rewards/accuracies": 0.0, "rewards/chosen": 2.549807071685791, "rewards/margins": -0.6566383838653564, "rewards/rejected": 3.2064454555511475, "step": 1813 }, { "epoch": 0.4, "learning_rate": 9.25540910646339e-06, "logits/chosen": -1.7118351459503174, "logits/rejected": -1.7466440200805664, "logps/chosen": -41.815162658691406, "logps/rejected": -128.77432250976562, "loss": 0.5312, "rewards/accuracies": 0.0, "rewards/chosen": 3.538498640060425, "rewards/margins": -0.6177241802215576, "rewards/rejected": 4.156222820281982, "step": 1814 }, { "epoch": 0.4, "learning_rate": 9.254467801550699e-06, "logits/chosen": -1.4477720260620117, "logits/rejected": -1.5405101776123047, "logps/chosen": -117.11175537109375, "logps/rejected": -65.81484985351562, "loss": 0.1493, "rewards/accuracies": 1.0, "rewards/chosen": 3.3492767810821533, "rewards/margins": 1.0564758777618408, "rewards/rejected": 2.2928009033203125, "step": 1815 }, { "epoch": 0.4, "learning_rate": 9.253525949949123e-06, "logits/chosen": -1.273246169090271, "logits/rejected": -1.244083046913147, "logps/chosen": -123.29181671142578, "logps/rejected": -84.52276611328125, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 7.135837554931641, "rewards/margins": 3.478256940841675, "rewards/rejected": 3.657580614089966, "step": 1816 }, { "epoch": 0.4, "learning_rate": 9.252583551779687e-06, "logits/chosen": -1.74118173122406, "logits/rejected": -1.733847975730896, "logps/chosen": -143.7974090576172, "logps/rejected": -144.71664428710938, "loss": 0.6125, "rewards/accuracies": 0.0, "rewards/chosen": 7.596421718597412, "rewards/margins": -0.8052964210510254, "rewards/rejected": 8.401718139648438, "step": 1817 }, { "epoch": 0.4, "learning_rate": 9.251640607163488e-06, "logits/chosen": -1.2887301445007324, "logits/rejected": -1.2033040523529053, "logps/chosen": -35.64884567260742, "logps/rejected": -23.72933006286621, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 3.323364019393921, "rewards/margins": 1.4957709312438965, "rewards/rejected": 1.8275930881500244, "step": 1818 }, { "epoch": 0.4, "learning_rate": 9.250697116221692e-06, "logits/chosen": -1.308727741241455, "logits/rejected": -1.2998515367507935, "logps/chosen": -79.82659912109375, "logps/rejected": -72.38892364501953, "loss": 1.1587, "rewards/accuracies": 0.0, "rewards/chosen": 2.817822217941284, "rewards/margins": -2.1068003177642822, "rewards/rejected": 4.924622535705566, "step": 1819 }, { "epoch": 0.4, "learning_rate": 9.249753079075534e-06, "logits/chosen": -1.2292454242706299, "logits/rejected": -1.127728819847107, "logps/chosen": -84.11012268066406, "logps/rejected": -67.94694519042969, "loss": 2.0559, "rewards/accuracies": 0.0, "rewards/chosen": 2.2890961170196533, "rewards/margins": -1.7758147716522217, "rewards/rejected": 4.064910888671875, "step": 1820 }, { "epoch": 0.4, "learning_rate": 9.248808495846322e-06, "logits/chosen": -1.5069704055786133, "logits/rejected": -1.4401086568832397, "logps/chosen": -126.25825500488281, "logps/rejected": -69.86125183105469, "loss": 0.4786, "rewards/accuracies": 1.0, "rewards/chosen": 3.568415880203247, "rewards/margins": 0.7369468212127686, "rewards/rejected": 2.8314690589904785, "step": 1821 }, { "epoch": 0.4, "learning_rate": 9.247863366655434e-06, "logits/chosen": -1.1816867589950562, "logits/rejected": -1.1567038297653198, "logps/chosen": -55.1234130859375, "logps/rejected": -62.242549896240234, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": 3.291577100753784, "rewards/margins": 1.2036752700805664, "rewards/rejected": 2.0879018306732178, "step": 1822 }, { "epoch": 0.4, "learning_rate": 9.246917691624314e-06, "logits/chosen": -1.200852870941162, "logits/rejected": -1.186861515045166, "logps/chosen": -79.13729858398438, "logps/rejected": -81.89388275146484, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 6.174069404602051, "rewards/margins": 2.8048532009124756, "rewards/rejected": 3.369216203689575, "step": 1823 }, { "epoch": 0.4, "learning_rate": 9.245971470874477e-06, "logits/chosen": -1.4763458967208862, "logits/rejected": -1.4856317043304443, "logps/chosen": -52.33563232421875, "logps/rejected": -80.41578674316406, "loss": 2.4684, "rewards/accuracies": 0.0, "rewards/chosen": 1.4606735706329346, "rewards/margins": -4.89105224609375, "rewards/rejected": 6.351726055145264, "step": 1824 }, { "epoch": 0.4, "learning_rate": 9.245024704527517e-06, "logits/chosen": -1.3951574563980103, "logits/rejected": -1.3294153213500977, "logps/chosen": -24.4570255279541, "logps/rejected": -19.542909622192383, "loss": 0.2933, "rewards/accuracies": 1.0, "rewards/chosen": 1.2815603017807007, "rewards/margins": 0.370064914226532, "rewards/rejected": 0.9114953875541687, "step": 1825 }, { "epoch": 0.4, "learning_rate": 9.244077392705085e-06, "logits/chosen": -1.5324903726577759, "logits/rejected": -1.3823856115341187, "logps/chosen": -135.2493438720703, "logps/rejected": -253.38125610351562, "loss": 0.2523, "rewards/accuracies": 1.0, "rewards/chosen": 8.305335998535156, "rewards/margins": 0.42232179641723633, "rewards/rejected": 7.88301420211792, "step": 1826 }, { "epoch": 0.4, "learning_rate": 9.243129535528909e-06, "logits/chosen": -1.634866714477539, "logits/rejected": -1.6198722124099731, "logps/chosen": -95.36285400390625, "logps/rejected": -66.4131851196289, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": 6.305016994476318, "rewards/margins": 2.1012425422668457, "rewards/rejected": 4.203774452209473, "step": 1827 }, { "epoch": 0.4, "learning_rate": 9.242181133120791e-06, "logits/chosen": -1.6394002437591553, "logits/rejected": -1.5758826732635498, "logps/chosen": -74.61961364746094, "logps/rejected": -31.318159103393555, "loss": 2.0063, "rewards/accuracies": 1.0, "rewards/chosen": 4.8474440574646, "rewards/margins": 2.890300750732422, "rewards/rejected": 1.9571431875228882, "step": 1828 }, { "epoch": 0.4, "learning_rate": 9.241232185602594e-06, "logits/chosen": -0.9990746974945068, "logits/rejected": -0.9990746974945068, "logps/chosen": -26.851673126220703, "logps/rejected": -26.851673126220703, "loss": 0.3811, "rewards/accuracies": 0.0, "rewards/chosen": 1.3174892663955688, "rewards/margins": 0.0, "rewards/rejected": 1.3174892663955688, "step": 1829 }, { "epoch": 0.41, "learning_rate": 9.240282693096257e-06, "logits/chosen": -1.2356289625167847, "logits/rejected": -1.2216020822525024, "logps/chosen": -82.54971313476562, "logps/rejected": -87.84494018554688, "loss": 0.4082, "rewards/accuracies": 0.0, "rewards/chosen": 4.791324138641357, "rewards/margins": -0.11090993881225586, "rewards/rejected": 4.902234077453613, "step": 1830 }, { "epoch": 0.41, "learning_rate": 9.239332655723787e-06, "logits/chosen": -1.6116973161697388, "logits/rejected": -1.4064871072769165, "logps/chosen": -144.2909393310547, "logps/rejected": -43.718467712402344, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 5.489329814910889, "rewards/margins": 3.9506125450134277, "rewards/rejected": 1.538717269897461, "step": 1831 }, { "epoch": 0.41, "learning_rate": 9.238382073607262e-06, "logits/chosen": -1.701415777206421, "logits/rejected": -1.701415777206421, "logps/chosen": -28.142868041992188, "logps/rejected": -28.142868041992188, "loss": 0.4171, "rewards/accuracies": 0.0, "rewards/chosen": 3.8761932849884033, "rewards/margins": 0.0, "rewards/rejected": 3.8761932849884033, "step": 1832 }, { "epoch": 0.41, "learning_rate": 9.237430946868829e-06, "logits/chosen": -1.1910802125930786, "logits/rejected": -1.1910802125930786, "logps/chosen": -14.111074447631836, "logps/rejected": -14.111074447631836, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 2.6840929985046387, "rewards/margins": 0.0, "rewards/rejected": 2.6840929985046387, "step": 1833 }, { "epoch": 0.41, "learning_rate": 9.236479275630707e-06, "logits/chosen": -1.6065398454666138, "logits/rejected": -1.6846611499786377, "logps/chosen": -58.958343505859375, "logps/rejected": -128.44183349609375, "loss": 2.932, "rewards/accuracies": 0.0, "rewards/chosen": 2.8048508167266846, "rewards/margins": -5.579535484313965, "rewards/rejected": 8.38438606262207, "step": 1834 }, { "epoch": 0.41, "learning_rate": 9.235527060015182e-06, "logits/chosen": -1.3085741996765137, "logits/rejected": -1.3085741996765137, "logps/chosen": -18.34896469116211, "logps/rejected": -18.34896469116211, "loss": 0.4529, "rewards/accuracies": 0.0, "rewards/chosen": 2.0264880657196045, "rewards/margins": 0.0, "rewards/rejected": 2.0264880657196045, "step": 1835 }, { "epoch": 0.41, "learning_rate": 9.23457430014461e-06, "logits/chosen": -1.1972439289093018, "logits/rejected": -1.1213231086730957, "logps/chosen": -42.40129089355469, "logps/rejected": -32.47992706298828, "loss": 1.0446, "rewards/accuracies": 1.0, "rewards/chosen": 2.718794345855713, "rewards/margins": 0.43474650382995605, "rewards/rejected": 2.284047842025757, "step": 1836 }, { "epoch": 0.41, "learning_rate": 9.233620996141421e-06, "logits/chosen": -1.4866033792495728, "logits/rejected": -1.5227841138839722, "logps/chosen": -72.71446990966797, "logps/rejected": -86.3123779296875, "loss": 1.5578, "rewards/accuracies": 0.0, "rewards/chosen": 3.9282989501953125, "rewards/margins": -2.9312682151794434, "rewards/rejected": 6.859567165374756, "step": 1837 }, { "epoch": 0.41, "learning_rate": 9.232667148128112e-06, "logits/chosen": -1.693894863128662, "logits/rejected": -1.6856682300567627, "logps/chosen": -143.5959014892578, "logps/rejected": -157.24093627929688, "loss": 1.0842, "rewards/accuracies": 0.0, "rewards/chosen": 7.520927429199219, "rewards/margins": -1.114267349243164, "rewards/rejected": 8.635194778442383, "step": 1838 }, { "epoch": 0.41, "learning_rate": 9.231712756227249e-06, "logits/chosen": -1.1347390413284302, "logits/rejected": -1.089816689491272, "logps/chosen": -14.238508224487305, "logps/rejected": -58.04293441772461, "loss": 2.2483, "rewards/accuracies": 0.0, "rewards/chosen": 0.6639745831489563, "rewards/margins": -4.407189846038818, "rewards/rejected": 5.071164608001709, "step": 1839 }, { "epoch": 0.41, "learning_rate": 9.23075782056147e-06, "logits/chosen": -1.5563570261001587, "logits/rejected": -1.5019302368164062, "logps/chosen": -69.75955200195312, "logps/rejected": -55.158111572265625, "loss": 0.5816, "rewards/accuracies": 0.0, "rewards/chosen": 3.375814199447632, "rewards/margins": -0.7019684314727783, "rewards/rejected": 4.07778263092041, "step": 1840 }, { "epoch": 0.41, "learning_rate": 9.229802341253482e-06, "logits/chosen": -1.375222086906433, "logits/rejected": -1.3787338733673096, "logps/chosen": -39.58979034423828, "logps/rejected": -88.58392333984375, "loss": 0.7517, "rewards/accuracies": 0.0, "rewards/chosen": 2.3873608112335205, "rewards/margins": -1.0941364765167236, "rewards/rejected": 3.481497287750244, "step": 1841 }, { "epoch": 0.41, "learning_rate": 9.22884631842606e-06, "logits/chosen": -1.4639320373535156, "logits/rejected": -1.4409795999526978, "logps/chosen": -35.761444091796875, "logps/rejected": -45.01050567626953, "loss": 0.5965, "rewards/accuracies": 0.0, "rewards/chosen": 2.5826213359832764, "rewards/margins": -0.8240566253662109, "rewards/rejected": 3.4066779613494873, "step": 1842 }, { "epoch": 0.41, "learning_rate": 9.227889752202052e-06, "logits/chosen": -1.1877039670944214, "logits/rejected": -1.2002801895141602, "logps/chosen": -42.80125427246094, "logps/rejected": -55.57651901245117, "loss": 1.1971, "rewards/accuracies": 0.0, "rewards/chosen": 3.9293618202209473, "rewards/margins": -0.3908071517944336, "rewards/rejected": 4.320168972015381, "step": 1843 }, { "epoch": 0.41, "learning_rate": 9.226932642704376e-06, "logits/chosen": -1.4910210371017456, "logits/rejected": -1.5251652002334595, "logps/chosen": -54.05638885498047, "logps/rejected": -83.69110107421875, "loss": 0.681, "rewards/accuracies": 0.0, "rewards/chosen": 1.1541413068771362, "rewards/margins": -1.0360487699508667, "rewards/rejected": 2.190190076828003, "step": 1844 }, { "epoch": 0.41, "learning_rate": 9.225974990056016e-06, "logits/chosen": -1.5648977756500244, "logits/rejected": -1.1156882047653198, "logps/chosen": -82.8666000366211, "logps/rejected": -138.9842529296875, "loss": 1.1048, "rewards/accuracies": 0.0, "rewards/chosen": 4.880063533782959, "rewards/margins": -0.25914573669433594, "rewards/rejected": 5.139209270477295, "step": 1845 }, { "epoch": 0.41, "learning_rate": 9.225016794380027e-06, "logits/chosen": -1.7164369821548462, "logits/rejected": -1.68478524684906, "logps/chosen": -64.31190490722656, "logps/rejected": -71.60517883300781, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": 3.8694779872894287, "rewards/margins": 1.4875962734222412, "rewards/rejected": 2.3818817138671875, "step": 1846 }, { "epoch": 0.41, "learning_rate": 9.22405805579954e-06, "logits/chosen": -1.3621805906295776, "logits/rejected": -1.244425654411316, "logps/chosen": -177.511962890625, "logps/rejected": -10.823968887329102, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 5.234399318695068, "rewards/margins": 4.807494640350342, "rewards/rejected": 0.4269048869609833, "step": 1847 }, { "epoch": 0.41, "learning_rate": 9.223098774437744e-06, "logits/chosen": -1.5119825601577759, "logits/rejected": -1.4631729125976562, "logps/chosen": -52.71971130371094, "logps/rejected": -86.01799774169922, "loss": 0.4005, "rewards/accuracies": 1.0, "rewards/chosen": 3.5143189430236816, "rewards/margins": 0.08160400390625, "rewards/rejected": 3.4327149391174316, "step": 1848 }, { "epoch": 0.41, "learning_rate": 9.222138950417908e-06, "logits/chosen": -1.2652454376220703, "logits/rejected": -1.3003311157226562, "logps/chosen": -71.76494598388672, "logps/rejected": -47.722869873046875, "loss": 1.142, "rewards/accuracies": 0.0, "rewards/chosen": 2.5745675563812256, "rewards/margins": -2.1742241382598877, "rewards/rejected": 4.748791694641113, "step": 1849 }, { "epoch": 0.41, "learning_rate": 9.221178583863367e-06, "logits/chosen": -1.3478660583496094, "logits/rejected": -1.3478660583496094, "logps/chosen": -5.0253424644470215, "logps/rejected": -5.0253424644470215, "loss": 1.8394, "rewards/accuracies": 0.0, "rewards/chosen": 1.1560394763946533, "rewards/margins": 0.0, "rewards/rejected": 1.1560394763946533, "step": 1850 }, { "epoch": 0.41, "learning_rate": 9.220217674897524e-06, "logits/chosen": -1.2688409090042114, "logits/rejected": -1.2482515573501587, "logps/chosen": -62.03236770629883, "logps/rejected": -73.319580078125, "loss": 1.4363, "rewards/accuracies": 0.0, "rewards/chosen": 2.2055294513702393, "rewards/margins": -1.0232760906219482, "rewards/rejected": 3.2288055419921875, "step": 1851 }, { "epoch": 0.41, "learning_rate": 9.219256223643857e-06, "logits/chosen": -1.6416077613830566, "logits/rejected": -1.661436676979065, "logps/chosen": -70.14360046386719, "logps/rejected": -120.83210754394531, "loss": 2.9581, "rewards/accuracies": 0.0, "rewards/chosen": 3.0931320190429688, "rewards/margins": -5.115762710571289, "rewards/rejected": 8.208894729614258, "step": 1852 }, { "epoch": 0.41, "learning_rate": 9.218294230225908e-06, "logits/chosen": -1.6112515926361084, "logits/rejected": -1.5685786008834839, "logps/chosen": -58.35802459716797, "logps/rejected": -59.626190185546875, "loss": 1.1311, "rewards/accuracies": 0.0, "rewards/chosen": 2.2018470764160156, "rewards/margins": -1.9216322898864746, "rewards/rejected": 4.12347936630249, "step": 1853 }, { "epoch": 0.41, "learning_rate": 9.217331694767291e-06, "logits/chosen": -1.4904202222824097, "logits/rejected": -1.494861364364624, "logps/chosen": -60.21122741699219, "logps/rejected": -81.18399047851562, "loss": 1.4897, "rewards/accuracies": 0.0, "rewards/chosen": 4.6651482582092285, "rewards/margins": -2.8118886947631836, "rewards/rejected": 7.477036952972412, "step": 1854 }, { "epoch": 0.41, "learning_rate": 9.21636861739169e-06, "logits/chosen": -1.3027327060699463, "logits/rejected": -1.1618057489395142, "logps/chosen": -71.60507202148438, "logps/rejected": -39.102237701416016, "loss": 0.3952, "rewards/accuracies": 1.0, "rewards/chosen": 3.861553907394409, "rewards/margins": 2.2855491638183594, "rewards/rejected": 1.5760048627853394, "step": 1855 }, { "epoch": 0.41, "learning_rate": 9.215404998222856e-06, "logits/chosen": -1.4782681465148926, "logits/rejected": -1.5058410167694092, "logps/chosen": -53.00831604003906, "logps/rejected": -75.15174865722656, "loss": 1.103, "rewards/accuracies": 1.0, "rewards/chosen": 3.861248731613159, "rewards/margins": 1.763420820236206, "rewards/rejected": 2.097827911376953, "step": 1856 }, { "epoch": 0.41, "learning_rate": 9.214440837384612e-06, "logits/chosen": -1.196492075920105, "logits/rejected": -1.3025522232055664, "logps/chosen": -16.85869598388672, "logps/rejected": -51.83359146118164, "loss": 2.0018, "rewards/accuracies": 0.0, "rewards/chosen": 1.1222151517868042, "rewards/margins": -3.9359850883483887, "rewards/rejected": 5.058200359344482, "step": 1857 }, { "epoch": 0.41, "learning_rate": 9.213476135000853e-06, "logits/chosen": -1.535090684890747, "logits/rejected": -1.5710067749023438, "logps/chosen": -45.598968505859375, "logps/rejected": -90.56608581542969, "loss": 0.5766, "rewards/accuracies": 0.0, "rewards/chosen": 5.249609470367432, "rewards/margins": -0.7376909255981445, "rewards/rejected": 5.987300395965576, "step": 1858 }, { "epoch": 0.41, "learning_rate": 9.21251089119554e-06, "logits/chosen": -1.300157904624939, "logits/rejected": -1.3775057792663574, "logps/chosen": -146.83297729492188, "logps/rejected": -134.6844024658203, "loss": 0.6582, "rewards/accuracies": 0.0, "rewards/chosen": 5.963983058929443, "rewards/margins": -0.9743576049804688, "rewards/rejected": 6.938340663909912, "step": 1859 }, { "epoch": 0.41, "learning_rate": 9.211545106092706e-06, "logits/chosen": -1.511522889137268, "logits/rejected": -1.3321948051452637, "logps/chosen": -61.59930419921875, "logps/rejected": -24.34623146057129, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 3.9007766246795654, "rewards/margins": 4.000065803527832, "rewards/rejected": -0.09928913414478302, "step": 1860 }, { "epoch": 0.41, "learning_rate": 9.210578779816449e-06, "logits/chosen": -1.2145668268203735, "logits/rejected": -1.257683277130127, "logps/chosen": -53.501949310302734, "logps/rejected": -64.43064880371094, "loss": 0.8562, "rewards/accuracies": 0.0, "rewards/chosen": 2.4089603424072266, "rewards/margins": -0.30898404121398926, "rewards/rejected": 2.717944383621216, "step": 1861 }, { "epoch": 0.41, "learning_rate": 9.20961191249094e-06, "logits/chosen": -1.0737162828445435, "logits/rejected": -1.1543903350830078, "logps/chosen": -9.556602478027344, "logps/rejected": -58.664207458496094, "loss": 3.2537, "rewards/accuracies": 0.0, "rewards/chosen": 0.29564180970191956, "rewards/margins": -5.81120491027832, "rewards/rejected": 6.106846809387207, "step": 1862 }, { "epoch": 0.41, "learning_rate": 9.208644504240418e-06, "logits/chosen": -1.5933434963226318, "logits/rejected": -1.4773354530334473, "logps/chosen": -65.9074935913086, "logps/rejected": -62.38930892944336, "loss": 0.4536, "rewards/accuracies": 1.0, "rewards/chosen": 5.7525835037231445, "rewards/margins": 1.7770390510559082, "rewards/rejected": 3.9755444526672363, "step": 1863 }, { "epoch": 0.41, "learning_rate": 9.207676555189196e-06, "logits/chosen": -1.8706823587417603, "logits/rejected": -1.747859239578247, "logps/chosen": -76.97151947021484, "logps/rejected": -129.92518615722656, "loss": 0.4756, "rewards/accuracies": 1.0, "rewards/chosen": 8.400805473327637, "rewards/margins": 3.941260814666748, "rewards/rejected": 4.459544658660889, "step": 1864 }, { "epoch": 0.41, "learning_rate": 9.206708065461652e-06, "logits/chosen": -1.6938976049423218, "logits/rejected": -1.7091609239578247, "logps/chosen": -104.63053894042969, "logps/rejected": -188.34002685546875, "loss": 2.3309, "rewards/accuracies": 0.0, "rewards/chosen": 3.9863815307617188, "rewards/margins": -2.880110263824463, "rewards/rejected": 6.866491794586182, "step": 1865 }, { "epoch": 0.41, "learning_rate": 9.205739035182236e-06, "logits/chosen": -1.4299519062042236, "logits/rejected": -1.396761178970337, "logps/chosen": -183.3480682373047, "logps/rejected": -64.63445281982422, "loss": 0.1342, "rewards/accuracies": 1.0, "rewards/chosen": 5.378581523895264, "rewards/margins": 3.095688819885254, "rewards/rejected": 2.2828927040100098, "step": 1866 }, { "epoch": 0.41, "learning_rate": 9.204769464475462e-06, "logits/chosen": -1.5346767902374268, "logits/rejected": -1.4411348104476929, "logps/chosen": -160.33786010742188, "logps/rejected": -60.977783203125, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 6.977771282196045, "rewards/margins": 4.156032085418701, "rewards/rejected": 2.8217391967773438, "step": 1867 }, { "epoch": 0.41, "learning_rate": 9.20379935346592e-06, "logits/chosen": -1.2918089628219604, "logits/rejected": -1.2356882095336914, "logps/chosen": -46.578086853027344, "logps/rejected": -38.27786636352539, "loss": 0.2074, "rewards/accuracies": 1.0, "rewards/chosen": 2.2770607471466064, "rewards/margins": 0.6660102605819702, "rewards/rejected": 1.6110504865646362, "step": 1868 }, { "epoch": 0.41, "learning_rate": 9.202828702278265e-06, "logits/chosen": -1.5386195182800293, "logits/rejected": -1.4089691638946533, "logps/chosen": -127.06484985351562, "logps/rejected": -46.510276794433594, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 7.39168119430542, "rewards/margins": 3.676543712615967, "rewards/rejected": 3.715137481689453, "step": 1869 }, { "epoch": 0.41, "learning_rate": 9.201857511037228e-06, "logits/chosen": -1.2379648685455322, "logits/rejected": -1.2217450141906738, "logps/chosen": -37.15968322753906, "logps/rejected": -30.086044311523438, "loss": 2.1826, "rewards/accuracies": 0.0, "rewards/chosen": 2.1480038166046143, "rewards/margins": -1.2001311779022217, "rewards/rejected": 3.348134994506836, "step": 1870 }, { "epoch": 0.41, "learning_rate": 9.200885779867601e-06, "logits/chosen": -1.0055692195892334, "logits/rejected": -1.00461745262146, "logps/chosen": -38.953346252441406, "logps/rejected": -35.8202018737793, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 2.641392469406128, "rewards/margins": -0.5901799201965332, "rewards/rejected": 3.231572389602661, "step": 1871 }, { "epoch": 0.41, "learning_rate": 9.199913508894251e-06, "logits/chosen": -1.5092607736587524, "logits/rejected": -1.5331859588623047, "logps/chosen": -119.46990966796875, "logps/rejected": -141.19805908203125, "loss": 1.3921, "rewards/accuracies": 0.0, "rewards/chosen": 4.397555828094482, "rewards/margins": -2.7177062034606934, "rewards/rejected": 7.115262031555176, "step": 1872 }, { "epoch": 0.41, "learning_rate": 9.198940698242108e-06, "logits/chosen": -1.6610758304595947, "logits/rejected": -1.622825026512146, "logps/chosen": -76.51168823242188, "logps/rejected": -71.38196563720703, "loss": 0.949, "rewards/accuracies": 0.0, "rewards/chosen": 4.069021701812744, "rewards/margins": -1.2190260887145996, "rewards/rejected": 5.288047790527344, "step": 1873 }, { "epoch": 0.41, "learning_rate": 9.197967348036182e-06, "logits/chosen": -1.3002468347549438, "logits/rejected": -1.2428615093231201, "logps/chosen": -124.63284301757812, "logps/rejected": -89.26953125, "loss": 0.5058, "rewards/accuracies": 1.0, "rewards/chosen": 5.48591947555542, "rewards/margins": 0.5339784622192383, "rewards/rejected": 4.951941013336182, "step": 1874 }, { "epoch": 0.42, "learning_rate": 9.196993458401544e-06, "logits/chosen": -1.4722886085510254, "logits/rejected": -1.3611606359481812, "logps/chosen": -142.99703979492188, "logps/rejected": -57.27883529663086, "loss": 0.9112, "rewards/accuracies": 0.0, "rewards/chosen": 3.297747850418091, "rewards/margins": -0.36042356491088867, "rewards/rejected": 3.6581714153289795, "step": 1875 }, { "epoch": 0.42, "learning_rate": 9.196019029463335e-06, "logits/chosen": -1.258050799369812, "logits/rejected": -1.258050799369812, "logps/chosen": -33.410728454589844, "logps/rejected": -33.410728454589844, "loss": 2.1732, "rewards/accuracies": 0.0, "rewards/chosen": 3.6687018871307373, "rewards/margins": 0.0, "rewards/rejected": 3.6687018871307373, "step": 1876 }, { "epoch": 0.42, "learning_rate": 9.195044061346767e-06, "logits/chosen": -2.1458985805511475, "logits/rejected": -2.1367955207824707, "logps/chosen": -27.026498794555664, "logps/rejected": -18.98125457763672, "loss": 0.2904, "rewards/accuracies": 1.0, "rewards/chosen": 1.4968420267105103, "rewards/margins": 0.2900289297103882, "rewards/rejected": 1.206813097000122, "step": 1877 }, { "epoch": 0.42, "learning_rate": 9.194068554177123e-06, "logits/chosen": -1.0296881198883057, "logits/rejected": -0.8750803470611572, "logps/chosen": -46.98413848876953, "logps/rejected": -4.820764064788818, "loss": 1.0665, "rewards/accuracies": 1.0, "rewards/chosen": 2.2728874683380127, "rewards/margins": 1.4079298973083496, "rewards/rejected": 0.8649576306343079, "step": 1878 }, { "epoch": 0.42, "learning_rate": 9.19309250807975e-06, "logits/chosen": -1.4175766706466675, "logits/rejected": -1.340920329093933, "logps/chosen": -56.71092987060547, "logps/rejected": -68.37023162841797, "loss": 1.9375, "rewards/accuracies": 0.0, "rewards/chosen": 2.958852529525757, "rewards/margins": -1.1677381992340088, "rewards/rejected": 4.126590728759766, "step": 1879 }, { "epoch": 0.42, "learning_rate": 9.192115923180071e-06, "logits/chosen": -1.208565592765808, "logits/rejected": -1.0906561613082886, "logps/chosen": -46.14645004272461, "logps/rejected": -20.20647430419922, "loss": 0.6538, "rewards/accuracies": 1.0, "rewards/chosen": 3.678834915161133, "rewards/margins": 3.0618135929107666, "rewards/rejected": 0.617021381855011, "step": 1880 }, { "epoch": 0.42, "learning_rate": 9.191138799603574e-06, "logits/chosen": -1.732157588005066, "logits/rejected": -1.7784582376480103, "logps/chosen": -65.72257995605469, "logps/rejected": -123.711669921875, "loss": 1.062, "rewards/accuracies": 0.0, "rewards/chosen": 5.026969909667969, "rewards/margins": -1.9721145629882812, "rewards/rejected": 6.99908447265625, "step": 1881 }, { "epoch": 0.42, "learning_rate": 9.190161137475814e-06, "logits/chosen": -1.3905197381973267, "logits/rejected": -1.317014217376709, "logps/chosen": -91.31139373779297, "logps/rejected": -54.59231185913086, "loss": 1.7572, "rewards/accuracies": 1.0, "rewards/chosen": 6.521531105041504, "rewards/margins": 3.8892452716827393, "rewards/rejected": 2.6322858333587646, "step": 1882 }, { "epoch": 0.42, "learning_rate": 9.189182936922424e-06, "logits/chosen": -1.3619530200958252, "logits/rejected": -1.3535387516021729, "logps/chosen": -61.51873016357422, "logps/rejected": -56.18553924560547, "loss": 0.776, "rewards/accuracies": 0.0, "rewards/chosen": 2.4004287719726562, "rewards/margins": -1.2381417751312256, "rewards/rejected": 3.638570547103882, "step": 1883 }, { "epoch": 0.42, "learning_rate": 9.188204198069096e-06, "logits/chosen": -1.303283452987671, "logits/rejected": -1.2969646453857422, "logps/chosen": -45.6450309753418, "logps/rejected": -81.8983154296875, "loss": 0.4281, "rewards/accuracies": 0.0, "rewards/chosen": 3.5569286346435547, "rewards/margins": -0.15686607360839844, "rewards/rejected": 3.713794708251953, "step": 1884 }, { "epoch": 0.42, "learning_rate": 9.187224921041595e-06, "logits/chosen": -1.382919430732727, "logits/rejected": -1.4003663063049316, "logps/chosen": -76.9938735961914, "logps/rejected": -138.79827880859375, "loss": 0.5887, "rewards/accuracies": 0.0, "rewards/chosen": 5.453933238983154, "rewards/margins": -0.8071446418762207, "rewards/rejected": 6.261077880859375, "step": 1885 }, { "epoch": 0.42, "learning_rate": 9.186245105965758e-06, "logits/chosen": -1.2757054567337036, "logits/rejected": -1.2699270248413086, "logps/chosen": -65.04178619384766, "logps/rejected": -98.93102264404297, "loss": 0.7858, "rewards/accuracies": 0.0, "rewards/chosen": 1.901013970375061, "rewards/margins": -1.3364533185958862, "rewards/rejected": 3.2374672889709473, "step": 1886 }, { "epoch": 0.42, "learning_rate": 9.18526475296749e-06, "logits/chosen": -1.4458378553390503, "logits/rejected": -1.4653840065002441, "logps/chosen": -98.52448272705078, "logps/rejected": -69.5676040649414, "loss": 1.6999, "rewards/accuracies": 0.0, "rewards/chosen": 2.5372750759124756, "rewards/margins": -3.361762762069702, "rewards/rejected": 5.899037837982178, "step": 1887 }, { "epoch": 0.42, "learning_rate": 9.184283862172763e-06, "logits/chosen": -1.5447927713394165, "logits/rejected": -1.5051137208938599, "logps/chosen": -82.8469009399414, "logps/rejected": -88.89952087402344, "loss": 0.4505, "rewards/accuracies": 1.0, "rewards/chosen": 5.145913124084473, "rewards/margins": 1.051389217376709, "rewards/rejected": 4.094523906707764, "step": 1888 }, { "epoch": 0.42, "learning_rate": 9.183302433707616e-06, "logits/chosen": -1.3906747102737427, "logits/rejected": -1.216719150543213, "logps/chosen": -119.24307250976562, "logps/rejected": -45.139137268066406, "loss": 0.2523, "rewards/accuracies": 1.0, "rewards/chosen": 5.477756023406982, "rewards/margins": 1.92753005027771, "rewards/rejected": 3.5502259731292725, "step": 1889 }, { "epoch": 0.42, "learning_rate": 9.182320467698164e-06, "logits/chosen": -1.4269219636917114, "logits/rejected": -1.3070555925369263, "logps/chosen": -124.22706604003906, "logps/rejected": -43.29988098144531, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 6.252850532531738, "rewards/margins": 3.864946126937866, "rewards/rejected": 2.387904405593872, "step": 1890 }, { "epoch": 0.42, "learning_rate": 9.181337964270585e-06, "logits/chosen": -1.1989690065383911, "logits/rejected": -1.1989690065383911, "logps/chosen": -55.99974060058594, "logps/rejected": -55.99974060058594, "loss": 0.8649, "rewards/accuracies": 0.0, "rewards/chosen": 2.688441514968872, "rewards/margins": 0.0, "rewards/rejected": 2.688441514968872, "step": 1891 }, { "epoch": 0.42, "learning_rate": 9.180354923551129e-06, "logits/chosen": -1.411087989807129, "logits/rejected": -1.411087989807129, "logps/chosen": -47.66848373413086, "logps/rejected": -47.66848373413086, "loss": 0.3698, "rewards/accuracies": 0.0, "rewards/chosen": 3.3154590129852295, "rewards/margins": 0.0, "rewards/rejected": 3.3154590129852295, "step": 1892 }, { "epoch": 0.42, "learning_rate": 9.179371345666115e-06, "logits/chosen": -1.2402352094650269, "logits/rejected": -1.2383363246917725, "logps/chosen": -51.77902603149414, "logps/rejected": -53.5150146484375, "loss": 0.1608, "rewards/accuracies": 1.0, "rewards/chosen": 3.381514310836792, "rewards/margins": 1.6752543449401855, "rewards/rejected": 1.7062599658966064, "step": 1893 }, { "epoch": 0.42, "learning_rate": 9.178387230741932e-06, "logits/chosen": -1.4031776189804077, "logits/rejected": -1.368830919265747, "logps/chosen": -62.01682662963867, "logps/rejected": -56.61524963378906, "loss": 0.3704, "rewards/accuracies": 1.0, "rewards/chosen": 5.275604724884033, "rewards/margins": 0.9725041389465332, "rewards/rejected": 4.3031005859375, "step": 1894 }, { "epoch": 0.42, "learning_rate": 9.177402578905032e-06, "logits/chosen": -1.4475833177566528, "logits/rejected": -1.4475833177566528, "logps/chosen": -50.7420768737793, "logps/rejected": -50.7420768737793, "loss": 0.3516, "rewards/accuracies": 0.0, "rewards/chosen": 2.072340726852417, "rewards/margins": 0.0, "rewards/rejected": 2.072340726852417, "step": 1895 }, { "epoch": 0.42, "learning_rate": 9.176417390281944e-06, "logits/chosen": -1.4118268489837646, "logits/rejected": -1.3395271301269531, "logps/chosen": -79.17913818359375, "logps/rejected": -94.36367797851562, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 5.690609931945801, "rewards/margins": 2.7663323879241943, "rewards/rejected": 2.9242775440216064, "step": 1896 }, { "epoch": 0.42, "learning_rate": 9.17543166499926e-06, "logits/chosen": -1.2181230783462524, "logits/rejected": -1.1928327083587646, "logps/chosen": -35.33190155029297, "logps/rejected": -43.86043930053711, "loss": 0.7138, "rewards/accuracies": 0.0, "rewards/chosen": 2.2437875270843506, "rewards/margins": -0.4552767276763916, "rewards/rejected": 2.699064254760742, "step": 1897 }, { "epoch": 0.42, "learning_rate": 9.174445403183645e-06, "logits/chosen": -1.891931176185608, "logits/rejected": -1.886613368988037, "logps/chosen": -117.66679382324219, "logps/rejected": -111.18310546875, "loss": 1.8748, "rewards/accuracies": 1.0, "rewards/chosen": 10.710182189941406, "rewards/margins": 4.29763650894165, "rewards/rejected": 6.412545680999756, "step": 1898 }, { "epoch": 0.42, "learning_rate": 9.173458604961832e-06, "logits/chosen": -1.4732327461242676, "logits/rejected": -1.3648061752319336, "logps/chosen": -75.00894165039062, "logps/rejected": -56.15057373046875, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 6.754773139953613, "rewards/margins": 4.968273162841797, "rewards/rejected": 1.7864998579025269, "step": 1899 }, { "epoch": 0.42, "learning_rate": 9.17247127046062e-06, "logits/chosen": -1.358460545539856, "logits/rejected": -1.116885781288147, "logps/chosen": -207.97308349609375, "logps/rejected": -51.962093353271484, "loss": 0.1159, "rewards/accuracies": 1.0, "rewards/chosen": 6.395251750946045, "rewards/margins": 4.211638450622559, "rewards/rejected": 2.1836133003234863, "step": 1900 }, { "epoch": 0.42, "learning_rate": 9.17148339980688e-06, "logits/chosen": -1.22205650806427, "logits/rejected": -1.2906798124313354, "logps/chosen": -31.0737247467041, "logps/rejected": -52.898468017578125, "loss": 1.7016, "rewards/accuracies": 0.0, "rewards/chosen": 1.8073710203170776, "rewards/margins": -3.173454761505127, "rewards/rejected": 4.980825901031494, "step": 1901 }, { "epoch": 0.42, "learning_rate": 9.170494993127552e-06, "logits/chosen": -1.6142075061798096, "logits/rejected": -1.570548176765442, "logps/chosen": -145.58604431152344, "logps/rejected": -164.9672088623047, "loss": 2.0209, "rewards/accuracies": 0.0, "rewards/chosen": 6.249580383300781, "rewards/margins": -3.874030113220215, "rewards/rejected": 10.123610496520996, "step": 1902 }, { "epoch": 0.42, "learning_rate": 9.169506050549641e-06, "logits/chosen": -1.4517630338668823, "logits/rejected": -1.399359941482544, "logps/chosen": -61.215370178222656, "logps/rejected": -67.63973236083984, "loss": 0.3005, "rewards/accuracies": 1.0, "rewards/chosen": 5.12713623046875, "rewards/margins": 2.876645565032959, "rewards/rejected": 2.250490665435791, "step": 1903 }, { "epoch": 0.42, "learning_rate": 9.168516572200227e-06, "logits/chosen": -1.4612009525299072, "logits/rejected": -1.403412103652954, "logps/chosen": -41.144710540771484, "logps/rejected": -54.89805603027344, "loss": 0.5022, "rewards/accuracies": 0.0, "rewards/chosen": 2.5956485271453857, "rewards/margins": -0.5437078475952148, "rewards/rejected": 3.1393563747406006, "step": 1904 }, { "epoch": 0.42, "learning_rate": 9.167526558206455e-06, "logits/chosen": -1.3305833339691162, "logits/rejected": -1.3713418245315552, "logps/chosen": -27.195066452026367, "logps/rejected": -64.5818862915039, "loss": 0.8379, "rewards/accuracies": 0.0, "rewards/chosen": 2.394801616668701, "rewards/margins": -1.3926701545715332, "rewards/rejected": 3.7874717712402344, "step": 1905 }, { "epoch": 0.42, "learning_rate": 9.166536008695536e-06, "logits/chosen": -1.4933849573135376, "logits/rejected": -1.4972354173660278, "logps/chosen": -52.087120056152344, "logps/rejected": -61.8641357421875, "loss": 0.8007, "rewards/accuracies": 0.0, "rewards/chosen": 1.7237694263458252, "rewards/margins": -1.3733696937561035, "rewards/rejected": 3.0971391201019287, "step": 1906 }, { "epoch": 0.42, "learning_rate": 9.165544923794758e-06, "logits/chosen": -1.6803115606307983, "logits/rejected": -1.7038938999176025, "logps/chosen": -46.51991271972656, "logps/rejected": -59.55951690673828, "loss": 1.5115, "rewards/accuracies": 0.0, "rewards/chosen": 3.482417345046997, "rewards/margins": -0.16666030883789062, "rewards/rejected": 3.6490776538848877, "step": 1907 }, { "epoch": 0.42, "learning_rate": 9.164553303631472e-06, "logits/chosen": -1.5568437576293945, "logits/rejected": -1.7966282367706299, "logps/chosen": -14.653928756713867, "logps/rejected": -47.81486511230469, "loss": 1.6348, "rewards/accuracies": 0.0, "rewards/chosen": 0.8670217394828796, "rewards/margins": -0.19956225156784058, "rewards/rejected": 1.0665839910507202, "step": 1908 }, { "epoch": 0.42, "learning_rate": 9.163561148333097e-06, "logits/chosen": -1.7020076513290405, "logits/rejected": -1.6322957277297974, "logps/chosen": -84.1310806274414, "logps/rejected": -63.905426025390625, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 5.66460657119751, "rewards/margins": 2.300732374191284, "rewards/rejected": 3.3638741970062256, "step": 1909 }, { "epoch": 0.42, "learning_rate": 9.162568458027122e-06, "logits/chosen": -1.393478274345398, "logits/rejected": -1.4275500774383545, "logps/chosen": -46.68562316894531, "logps/rejected": -97.4100112915039, "loss": 3.7287, "rewards/accuracies": 0.0, "rewards/chosen": 3.126481771469116, "rewards/margins": -6.044981956481934, "rewards/rejected": 9.171463966369629, "step": 1910 }, { "epoch": 0.42, "learning_rate": 9.16157523284111e-06, "logits/chosen": -1.2526865005493164, "logits/rejected": -1.205324649810791, "logps/chosen": -49.28899383544922, "logps/rejected": -43.002777099609375, "loss": 0.6723, "rewards/accuracies": 0.0, "rewards/chosen": 1.5798782110214233, "rewards/margins": -0.9935334920883179, "rewards/rejected": 2.573411703109741, "step": 1911 }, { "epoch": 0.42, "learning_rate": 9.16058147290268e-06, "logits/chosen": -1.3652071952819824, "logits/rejected": -1.443202257156372, "logps/chosen": -67.6981201171875, "logps/rejected": -119.12728881835938, "loss": 4.4862, "rewards/accuracies": 0.0, "rewards/chosen": 3.3229217529296875, "rewards/margins": -5.300475120544434, "rewards/rejected": 8.623396873474121, "step": 1912 }, { "epoch": 0.42, "learning_rate": 9.159587178339535e-06, "logits/chosen": -1.6982767581939697, "logits/rejected": -1.5845881700515747, "logps/chosen": -86.44021606445312, "logps/rejected": -9.88256549835205, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": 2.9971892833709717, "rewards/margins": 1.807983636856079, "rewards/rejected": 1.1892056465148926, "step": 1913 }, { "epoch": 0.42, "learning_rate": 9.158592349279439e-06, "logits/chosen": -1.4305734634399414, "logits/rejected": -1.21976900100708, "logps/chosen": -103.62275695800781, "logps/rejected": -37.024539947509766, "loss": 0.218, "rewards/accuracies": 1.0, "rewards/chosen": 5.9846038818359375, "rewards/margins": 6.184932231903076, "rewards/rejected": -0.20032845437526703, "step": 1914 }, { "epoch": 0.42, "learning_rate": 9.157596985850218e-06, "logits/chosen": -1.5881167650222778, "logits/rejected": -1.5594592094421387, "logps/chosen": -105.18968963623047, "logps/rejected": -86.22171020507812, "loss": 0.2045, "rewards/accuracies": 1.0, "rewards/chosen": 5.420533180236816, "rewards/margins": 1.0917201042175293, "rewards/rejected": 4.328813076019287, "step": 1915 }, { "epoch": 0.42, "learning_rate": 9.156601088179785e-06, "logits/chosen": -1.5747019052505493, "logits/rejected": -1.5850030183792114, "logps/chosen": -50.83811950683594, "logps/rejected": -38.50218963623047, "loss": 1.2721, "rewards/accuracies": 0.0, "rewards/chosen": 2.328037977218628, "rewards/margins": -0.3207695484161377, "rewards/rejected": 2.6488075256347656, "step": 1916 }, { "epoch": 0.42, "learning_rate": 9.1556046563961e-06, "logits/chosen": -1.1013872623443604, "logits/rejected": -1.2629765272140503, "logps/chosen": -74.66641235351562, "logps/rejected": -56.909080505371094, "loss": 0.9037, "rewards/accuracies": 0.0, "rewards/chosen": 4.148692607879639, "rewards/margins": -1.623929500579834, "rewards/rejected": 5.772622108459473, "step": 1917 }, { "epoch": 0.42, "learning_rate": 9.154607690627207e-06, "logits/chosen": -1.508522391319275, "logits/rejected": -1.5029020309448242, "logps/chosen": -48.03802490234375, "logps/rejected": -86.59324645996094, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 1.4908859729766846, "rewards/margins": 0.4731835126876831, "rewards/rejected": 1.0177024602890015, "step": 1918 }, { "epoch": 0.42, "learning_rate": 9.153610191001214e-06, "logits/chosen": -1.9265060424804688, "logits/rejected": -1.933613657951355, "logps/chosen": -51.43162536621094, "logps/rejected": -27.9575252532959, "loss": 0.1648, "rewards/accuracies": 1.0, "rewards/chosen": 2.1323082447052, "rewards/margins": 0.9765394926071167, "rewards/rejected": 1.1557687520980835, "step": 1919 }, { "epoch": 0.42, "learning_rate": 9.152612157646297e-06, "logits/chosen": -1.459986925125122, "logits/rejected": -1.4774870872497559, "logps/chosen": -75.05024719238281, "logps/rejected": -90.18739318847656, "loss": 0.7195, "rewards/accuracies": 0.0, "rewards/chosen": 3.345442295074463, "rewards/margins": -1.1644883155822754, "rewards/rejected": 4.509930610656738, "step": 1920 }, { "epoch": 0.43, "learning_rate": 9.1516135906907e-06, "logits/chosen": -1.4825884103775024, "logits/rejected": -1.4063925743103027, "logps/chosen": -49.470848083496094, "logps/rejected": -84.46098327636719, "loss": 2.071, "rewards/accuracies": 0.0, "rewards/chosen": 1.890908122062683, "rewards/margins": -4.0642476081848145, "rewards/rejected": 5.955155849456787, "step": 1921 }, { "epoch": 0.43, "learning_rate": 9.150614490262736e-06, "logits/chosen": -1.5229936838150024, "logits/rejected": -1.5016785860061646, "logps/chosen": -65.25233459472656, "logps/rejected": -72.72015380859375, "loss": 0.8791, "rewards/accuracies": 0.0, "rewards/chosen": 2.6637344360351562, "rewards/margins": -1.4928321838378906, "rewards/rejected": 4.156566619873047, "step": 1922 }, { "epoch": 0.43, "learning_rate": 9.149614856490788e-06, "logits/chosen": -1.8519500494003296, "logits/rejected": -1.8407565355300903, "logps/chosen": -58.19993591308594, "logps/rejected": -30.777563095092773, "loss": 0.3914, "rewards/accuracies": 0.0, "rewards/chosen": 2.8759918212890625, "rewards/margins": -0.06206965446472168, "rewards/rejected": 2.938061475753784, "step": 1923 }, { "epoch": 0.43, "learning_rate": 9.148614689503307e-06, "logits/chosen": -1.4347344636917114, "logits/rejected": -1.3554924726486206, "logps/chosen": -35.31287384033203, "logps/rejected": -19.68279266357422, "loss": 0.1337, "rewards/accuracies": 1.0, "rewards/chosen": 2.862778902053833, "rewards/margins": 2.062356472015381, "rewards/rejected": 0.8004224896430969, "step": 1924 }, { "epoch": 0.43, "learning_rate": 9.147613989428809e-06, "logits/chosen": -1.546480417251587, "logits/rejected": -1.547044277191162, "logps/chosen": -45.43852233886719, "logps/rejected": -30.996875762939453, "loss": 0.4202, "rewards/accuracies": 1.0, "rewards/chosen": 3.3032455444335938, "rewards/margins": 2.3689217567443848, "rewards/rejected": 0.9343239068984985, "step": 1925 }, { "epoch": 0.43, "learning_rate": 9.146612756395888e-06, "logits/chosen": -1.6390801668167114, "logits/rejected": -1.616934061050415, "logps/chosen": -136.01683044433594, "logps/rejected": -92.4529037475586, "loss": 0.7467, "rewards/accuracies": 0.0, "rewards/chosen": 5.19699239730835, "rewards/margins": -0.6114649772644043, "rewards/rejected": 5.808457374572754, "step": 1926 }, { "epoch": 0.43, "learning_rate": 9.145610990533193e-06, "logits/chosen": -1.3821007013320923, "logits/rejected": -1.2912561893463135, "logps/chosen": -57.26065444946289, "logps/rejected": -37.99729537963867, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 3.363004684448242, "rewards/margins": 1.8011596202850342, "rewards/rejected": 1.561845064163208, "step": 1927 }, { "epoch": 0.43, "learning_rate": 9.144608691969452e-06, "logits/chosen": -1.610920786857605, "logits/rejected": -1.4940111637115479, "logps/chosen": -126.32537078857422, "logps/rejected": -130.9186553955078, "loss": 0.3082, "rewards/accuracies": 1.0, "rewards/chosen": 10.31239128112793, "rewards/margins": 6.37556266784668, "rewards/rejected": 3.93682861328125, "step": 1928 }, { "epoch": 0.43, "learning_rate": 9.143605860833459e-06, "logits/chosen": -1.3530880212783813, "logits/rejected": -1.3530880212783813, "logps/chosen": -66.22491455078125, "logps/rejected": -66.22491455078125, "loss": 0.3539, "rewards/accuracies": 0.0, "rewards/chosen": 5.793800354003906, "rewards/margins": 0.0, "rewards/rejected": 5.793800354003906, "step": 1929 }, { "epoch": 0.43, "learning_rate": 9.142602497254071e-06, "logits/chosen": -1.375596046447754, "logits/rejected": -1.3912757635116577, "logps/chosen": -89.079833984375, "logps/rejected": -102.81582641601562, "loss": 1.925, "rewards/accuracies": 0.0, "rewards/chosen": 1.9289993047714233, "rewards/margins": -3.0395307540893555, "rewards/rejected": 4.968530178070068, "step": 1930 }, { "epoch": 0.43, "learning_rate": 9.141598601360225e-06, "logits/chosen": -1.3690006732940674, "logits/rejected": -1.2603340148925781, "logps/chosen": -59.50870132446289, "logps/rejected": -57.17589569091797, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 4.539849758148193, "rewards/margins": 2.4738385677337646, "rewards/rejected": 2.0660111904144287, "step": 1931 }, { "epoch": 0.43, "learning_rate": 9.14059417328091e-06, "logits/chosen": -1.6399271488189697, "logits/rejected": -1.6496230363845825, "logps/chosen": -25.124652862548828, "logps/rejected": -66.70811462402344, "loss": 0.735, "rewards/accuracies": 0.0, "rewards/chosen": 2.048340320587158, "rewards/margins": -0.5767085552215576, "rewards/rejected": 2.625048875808716, "step": 1932 }, { "epoch": 0.43, "learning_rate": 9.139589213145202e-06, "logits/chosen": -1.4373178482055664, "logits/rejected": -1.3806416988372803, "logps/chosen": -123.69430541992188, "logps/rejected": -93.11983489990234, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 7.508824348449707, "rewards/margins": 1.5414729118347168, "rewards/rejected": 5.96735143661499, "step": 1933 }, { "epoch": 0.43, "learning_rate": 9.138583721082229e-06, "logits/chosen": -1.5201278924942017, "logits/rejected": -1.4021800756454468, "logps/chosen": -107.62460327148438, "logps/rejected": -5.653226852416992, "loss": 0.0799, "rewards/accuracies": 1.0, "rewards/chosen": 2.784167528152466, "rewards/margins": 1.783214807510376, "rewards/rejected": 1.0009527206420898, "step": 1934 }, { "epoch": 0.43, "learning_rate": 9.137577697221195e-06, "logits/chosen": -1.7767680883407593, "logits/rejected": -1.4525607824325562, "logps/chosen": -131.4597625732422, "logps/rejected": -121.52798461914062, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": 10.095436096191406, "rewards/margins": 3.8140149116516113, "rewards/rejected": 6.281421184539795, "step": 1935 }, { "epoch": 0.43, "learning_rate": 9.136571141691376e-06, "logits/chosen": -1.8613451719284058, "logits/rejected": -1.843201756477356, "logps/chosen": -70.14311218261719, "logps/rejected": -90.01174926757812, "loss": 0.3827, "rewards/accuracies": 0.0, "rewards/chosen": 2.9895615577697754, "rewards/margins": -0.01740717887878418, "rewards/rejected": 3.0069687366485596, "step": 1936 }, { "epoch": 0.43, "learning_rate": 9.135564054622108e-06, "logits/chosen": -1.6549489498138428, "logits/rejected": -1.6418062448501587, "logps/chosen": -96.62010192871094, "logps/rejected": -66.46503448486328, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 5.652470588684082, "rewards/margins": 0.8460841178894043, "rewards/rejected": 4.806386470794678, "step": 1937 }, { "epoch": 0.43, "learning_rate": 9.134556436142801e-06, "logits/chosen": -1.3929567337036133, "logits/rejected": -1.2723205089569092, "logps/chosen": -54.85723876953125, "logps/rejected": -14.692413330078125, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 2.542757511138916, "rewards/margins": 1.0901504755020142, "rewards/rejected": 1.4526070356369019, "step": 1938 }, { "epoch": 0.43, "learning_rate": 9.133548286382932e-06, "logits/chosen": -1.5581213235855103, "logits/rejected": -1.5811796188354492, "logps/chosen": -38.62838363647461, "logps/rejected": -46.76019287109375, "loss": 1.4259, "rewards/accuracies": 1.0, "rewards/chosen": 3.33868145942688, "rewards/margins": 0.04594230651855469, "rewards/rejected": 3.292739152908325, "step": 1939 }, { "epoch": 0.43, "learning_rate": 9.132539605472044e-06, "logits/chosen": -1.32700777053833, "logits/rejected": -1.1417272090911865, "logps/chosen": -149.94338989257812, "logps/rejected": -69.87932586669922, "loss": 0.2682, "rewards/accuracies": 1.0, "rewards/chosen": 4.298314094543457, "rewards/margins": 0.7342875003814697, "rewards/rejected": 3.5640265941619873, "step": 1940 }, { "epoch": 0.43, "learning_rate": 9.131530393539752e-06, "logits/chosen": -1.6747089624404907, "logits/rejected": -1.5228089094161987, "logps/chosen": -115.45285034179688, "logps/rejected": -49.228416442871094, "loss": 0.1856, "rewards/accuracies": 1.0, "rewards/chosen": 6.752313137054443, "rewards/margins": 2.2289915084838867, "rewards/rejected": 4.523321628570557, "step": 1941 }, { "epoch": 0.43, "learning_rate": 9.130520650715735e-06, "logits/chosen": -1.522559404373169, "logits/rejected": -1.5841337442398071, "logps/chosen": -91.76524353027344, "logps/rejected": -165.0645751953125, "loss": 0.2202, "rewards/accuracies": 1.0, "rewards/chosen": 6.393601894378662, "rewards/margins": 0.7243175506591797, "rewards/rejected": 5.669284343719482, "step": 1942 }, { "epoch": 0.43, "learning_rate": 9.129510377129745e-06, "logits/chosen": -1.1794052124023438, "logits/rejected": -1.1794052124023438, "logps/chosen": -54.049659729003906, "logps/rejected": -54.049659729003906, "loss": 1.4713, "rewards/accuracies": 0.0, "rewards/chosen": 2.100252628326416, "rewards/margins": 0.0, "rewards/rejected": 2.100252628326416, "step": 1943 }, { "epoch": 0.43, "learning_rate": 9.128499572911596e-06, "logits/chosen": -1.3944681882858276, "logits/rejected": -1.3944681882858276, "logps/chosen": -29.53155517578125, "logps/rejected": -29.53155517578125, "loss": 0.5218, "rewards/accuracies": 0.0, "rewards/chosen": 3.327840805053711, "rewards/margins": 0.0, "rewards/rejected": 3.327840805053711, "step": 1944 }, { "epoch": 0.43, "learning_rate": 9.12748823819118e-06, "logits/chosen": -1.562319040298462, "logits/rejected": -1.510016918182373, "logps/chosen": -168.28073120117188, "logps/rejected": -87.85714721679688, "loss": 0.1457, "rewards/accuracies": 1.0, "rewards/chosen": 7.1597580909729, "rewards/margins": 1.0913758277893066, "rewards/rejected": 6.068382263183594, "step": 1945 }, { "epoch": 0.43, "learning_rate": 9.126476373098446e-06, "logits/chosen": -1.688307762145996, "logits/rejected": -1.5711499452590942, "logps/chosen": -91.76966094970703, "logps/rejected": -31.126941680908203, "loss": 0.5263, "rewards/accuracies": 0.0, "rewards/chosen": 1.8169959783554077, "rewards/margins": -0.2534555196762085, "rewards/rejected": 2.070451498031616, "step": 1946 }, { "epoch": 0.43, "learning_rate": 9.125463977763417e-06, "logits/chosen": -1.2660049200057983, "logits/rejected": -1.2215839624404907, "logps/chosen": -57.29796600341797, "logps/rejected": -63.00724792480469, "loss": 0.1796, "rewards/accuracies": 1.0, "rewards/chosen": 3.6620118618011475, "rewards/margins": 1.4406464099884033, "rewards/rejected": 2.221365451812744, "step": 1947 }, { "epoch": 0.43, "learning_rate": 9.124451052316185e-06, "logits/chosen": -1.3780771493911743, "logits/rejected": -1.4844493865966797, "logps/chosen": -106.89340209960938, "logps/rejected": -116.72354888916016, "loss": 0.9181, "rewards/accuracies": 0.0, "rewards/chosen": 4.609938144683838, "rewards/margins": -1.4799537658691406, "rewards/rejected": 6.0898919105529785, "step": 1948 }, { "epoch": 0.43, "learning_rate": 9.123437596886909e-06, "logits/chosen": -1.3018460273742676, "logits/rejected": -1.1935113668441772, "logps/chosen": -88.53123474121094, "logps/rejected": -34.32157897949219, "loss": 0.498, "rewards/accuracies": 0.0, "rewards/chosen": 2.1781113147735596, "rewards/margins": -0.5122432708740234, "rewards/rejected": 2.690354585647583, "step": 1949 }, { "epoch": 0.43, "learning_rate": 9.122423611605814e-06, "logits/chosen": -1.2862443923950195, "logits/rejected": -1.2473523616790771, "logps/chosen": -51.763511657714844, "logps/rejected": -68.5047378540039, "loss": 0.6733, "rewards/accuracies": 0.0, "rewards/chosen": 3.8814704418182373, "rewards/margins": -0.30925917625427246, "rewards/rejected": 4.19072961807251, "step": 1950 }, { "epoch": 0.43, "learning_rate": 9.121409096603193e-06, "logits/chosen": -1.5305116176605225, "logits/rejected": -1.5305116176605225, "logps/chosen": -51.8342399597168, "logps/rejected": -51.8342399597168, "loss": 0.3503, "rewards/accuracies": 0.0, "rewards/chosen": 3.1590306758880615, "rewards/margins": 0.0, "rewards/rejected": 3.1590306758880615, "step": 1951 }, { "epoch": 0.43, "learning_rate": 9.120394052009412e-06, "logits/chosen": -1.7409439086914062, "logits/rejected": -1.6531352996826172, "logps/chosen": -100.24259185791016, "logps/rejected": -52.602272033691406, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/chosen": 6.158351898193359, "rewards/margins": 1.6826705932617188, "rewards/rejected": 4.475681304931641, "step": 1952 }, { "epoch": 0.43, "learning_rate": 9.1193784779549e-06, "logits/chosen": -1.2974028587341309, "logits/rejected": -1.239155888557434, "logps/chosen": -45.14373779296875, "logps/rejected": -33.452537536621094, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": 2.8844215869903564, "rewards/margins": 0.6576492786407471, "rewards/rejected": 2.2267723083496094, "step": 1953 }, { "epoch": 0.43, "learning_rate": 9.118362374570158e-06, "logits/chosen": -1.7064684629440308, "logits/rejected": -1.6504579782485962, "logps/chosen": -77.7878646850586, "logps/rejected": -49.42680740356445, "loss": 0.8, "rewards/accuracies": 0.0, "rewards/chosen": 3.205590009689331, "rewards/margins": -1.3299076557159424, "rewards/rejected": 4.535497665405273, "step": 1954 }, { "epoch": 0.43, "learning_rate": 9.117345741985749e-06, "logits/chosen": -1.7132021188735962, "logits/rejected": -1.5422825813293457, "logps/chosen": -92.25688934326172, "logps/rejected": -71.94709777832031, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": 8.991331100463867, "rewards/margins": 3.760134220123291, "rewards/rejected": 5.231196880340576, "step": 1955 }, { "epoch": 0.43, "learning_rate": 9.116328580332309e-06, "logits/chosen": -1.2772777080535889, "logits/rejected": -1.2772777080535889, "logps/chosen": -39.5387077331543, "logps/rejected": -39.5387077331543, "loss": 0.3477, "rewards/accuracies": 0.0, "rewards/chosen": 1.222110390663147, "rewards/margins": 0.0, "rewards/rejected": 1.222110390663147, "step": 1956 }, { "epoch": 0.43, "learning_rate": 9.115310889740545e-06, "logits/chosen": -1.262521743774414, "logits/rejected": -1.2550809383392334, "logps/chosen": -32.42430877685547, "logps/rejected": -51.72517395019531, "loss": 0.5532, "rewards/accuracies": 0.0, "rewards/chosen": 2.115255832672119, "rewards/margins": -0.6929657459259033, "rewards/rejected": 2.8082215785980225, "step": 1957 }, { "epoch": 0.43, "learning_rate": 9.114292670341222e-06, "logits/chosen": -1.2357288599014282, "logits/rejected": -1.1544986963272095, "logps/chosen": -36.55575180053711, "logps/rejected": -14.75609016418457, "loss": 0.2122, "rewards/accuracies": 1.0, "rewards/chosen": 2.108985662460327, "rewards/margins": 0.9966905117034912, "rewards/rejected": 1.112295150756836, "step": 1958 }, { "epoch": 0.43, "learning_rate": 9.113273922265183e-06, "logits/chosen": -1.7243050336837769, "logits/rejected": -1.7784398794174194, "logps/chosen": -68.36286926269531, "logps/rejected": -65.62428283691406, "loss": 1.2147, "rewards/accuracies": 1.0, "rewards/chosen": 5.498105525970459, "rewards/margins": 0.007092952728271484, "rewards/rejected": 5.4910125732421875, "step": 1959 }, { "epoch": 0.43, "learning_rate": 9.112254645643332e-06, "logits/chosen": -1.3711947202682495, "logits/rejected": -1.3519937992095947, "logps/chosen": -93.4698486328125, "logps/rejected": -49.863441467285156, "loss": 1.0486, "rewards/accuracies": 0.0, "rewards/chosen": 0.9068710207939148, "rewards/margins": -1.4208405017852783, "rewards/rejected": 2.327711582183838, "step": 1960 }, { "epoch": 0.43, "learning_rate": 9.111234840606647e-06, "logits/chosen": -1.5255658626556396, "logits/rejected": -1.5110297203063965, "logps/chosen": -106.50186157226562, "logps/rejected": -77.32829284667969, "loss": 0.4451, "rewards/accuracies": 0.0, "rewards/chosen": 6.006402492523193, "rewards/margins": -0.35894775390625, "rewards/rejected": 6.365350246429443, "step": 1961 }, { "epoch": 0.43, "learning_rate": 9.110214507286167e-06, "logits/chosen": -1.439176321029663, "logits/rejected": -1.5611860752105713, "logps/chosen": -101.58042907714844, "logps/rejected": -237.71353149414062, "loss": 3.8098, "rewards/accuracies": 0.0, "rewards/chosen": 6.079738140106201, "rewards/margins": -6.101887226104736, "rewards/rejected": 12.181625366210938, "step": 1962 }, { "epoch": 0.43, "learning_rate": 9.109193645813001e-06, "logits/chosen": -1.5989009141921997, "logits/rejected": -1.6065469980239868, "logps/chosen": -47.469207763671875, "logps/rejected": -75.93612670898438, "loss": 1.1881, "rewards/accuracies": 1.0, "rewards/chosen": 1.1611378192901611, "rewards/margins": 0.19285166263580322, "rewards/rejected": 0.9682861566543579, "step": 1963 }, { "epoch": 0.43, "learning_rate": 9.10817225631833e-06, "logits/chosen": -1.4116545915603638, "logits/rejected": -1.3495084047317505, "logps/chosen": -53.19621276855469, "logps/rejected": -14.53435230255127, "loss": 0.8563, "rewards/accuracies": 1.0, "rewards/chosen": 3.3757011890411377, "rewards/margins": 2.967778444290161, "rewards/rejected": 0.4079228341579437, "step": 1964 }, { "epoch": 0.43, "learning_rate": 9.107150338933403e-06, "logits/chosen": -1.435162901878357, "logits/rejected": -1.3913041353225708, "logps/chosen": -52.811275482177734, "logps/rejected": -58.52728271484375, "loss": 0.5268, "rewards/accuracies": 1.0, "rewards/chosen": 3.0561931133270264, "rewards/margins": 0.021654844284057617, "rewards/rejected": 3.0345382690429688, "step": 1965 }, { "epoch": 0.44, "learning_rate": 9.10612789378953e-06, "logits/chosen": -1.2880696058273315, "logits/rejected": -1.2666168212890625, "logps/chosen": -74.58623504638672, "logps/rejected": -58.043209075927734, "loss": 0.459, "rewards/accuracies": 1.0, "rewards/chosen": 2.597548723220825, "rewards/margins": 0.9287738800048828, "rewards/rejected": 1.6687748432159424, "step": 1966 }, { "epoch": 0.44, "learning_rate": 9.105104921018092e-06, "logits/chosen": -1.6383135318756104, "logits/rejected": -1.6448689699172974, "logps/chosen": -128.88160705566406, "logps/rejected": -132.8385467529297, "loss": 0.4896, "rewards/accuracies": 0.0, "rewards/chosen": 7.4918107986450195, "rewards/margins": -0.09696197509765625, "rewards/rejected": 7.588772773742676, "step": 1967 }, { "epoch": 0.44, "learning_rate": 9.10408142075054e-06, "logits/chosen": -1.405896544456482, "logits/rejected": -1.4241583347320557, "logps/chosen": -88.35752868652344, "logps/rejected": -136.39590454101562, "loss": 0.8811, "rewards/accuracies": 0.0, "rewards/chosen": 7.5187530517578125, "rewards/margins": -1.2800521850585938, "rewards/rejected": 8.798805236816406, "step": 1968 }, { "epoch": 0.44, "learning_rate": 9.103057393118392e-06, "logits/chosen": -1.5075713396072388, "logits/rejected": -1.4658201932907104, "logps/chosen": -115.0162124633789, "logps/rejected": -102.15514373779297, "loss": 0.0764, "rewards/accuracies": 1.0, "rewards/chosen": 10.195140838623047, "rewards/margins": 3.754896640777588, "rewards/rejected": 6.440244197845459, "step": 1969 }, { "epoch": 0.44, "learning_rate": 9.102032838253232e-06, "logits/chosen": -1.4649529457092285, "logits/rejected": -1.448184847831726, "logps/chosen": -37.38152313232422, "logps/rejected": -55.36387634277344, "loss": 1.136, "rewards/accuracies": 0.0, "rewards/chosen": 1.3060463666915894, "rewards/margins": -2.0282187461853027, "rewards/rejected": 3.3342652320861816, "step": 1970 }, { "epoch": 0.44, "learning_rate": 9.101007756286713e-06, "logits/chosen": -1.361914038658142, "logits/rejected": -1.329492449760437, "logps/chosen": -85.01481628417969, "logps/rejected": -44.48216247558594, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 2.1743881702423096, "rewards/margins": -1.0988266468048096, "rewards/rejected": 3.273214817047119, "step": 1971 }, { "epoch": 0.44, "learning_rate": 9.099982147350558e-06, "logits/chosen": -1.5428138971328735, "logits/rejected": -1.551369071006775, "logps/chosen": -51.033409118652344, "logps/rejected": -52.9848747253418, "loss": 1.7044, "rewards/accuracies": 0.0, "rewards/chosen": 2.6984550952911377, "rewards/margins": -0.9795176982879639, "rewards/rejected": 3.6779727935791016, "step": 1972 }, { "epoch": 0.44, "learning_rate": 9.098956011576552e-06, "logits/chosen": -1.3592218160629272, "logits/rejected": -1.3592218160629272, "logps/chosen": -29.288089752197266, "logps/rejected": -29.288089752197266, "loss": 0.3784, "rewards/accuracies": 0.0, "rewards/chosen": 0.9679611325263977, "rewards/margins": 0.0, "rewards/rejected": 0.9679611325263977, "step": 1973 }, { "epoch": 0.44, "learning_rate": 9.097929349096551e-06, "logits/chosen": -1.4166202545166016, "logits/rejected": -1.4466097354888916, "logps/chosen": -66.30315399169922, "logps/rejected": -94.88156127929688, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 3.1680397987365723, "rewards/margins": 0.5489921569824219, "rewards/rejected": 2.6190476417541504, "step": 1974 }, { "epoch": 0.44, "learning_rate": 9.09690216004248e-06, "logits/chosen": -1.9337444305419922, "logits/rejected": -1.8930453062057495, "logps/chosen": -83.11953735351562, "logps/rejected": -109.70988464355469, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 5.9556732177734375, "rewards/margins": 3.9249908924102783, "rewards/rejected": 2.030682325363159, "step": 1975 }, { "epoch": 0.44, "learning_rate": 9.09587444454633e-06, "logits/chosen": -1.5215169191360474, "logits/rejected": -1.4534822702407837, "logps/chosen": -89.22840118408203, "logps/rejected": -25.39101219177246, "loss": 0.8449, "rewards/accuracies": 1.0, "rewards/chosen": 2.6023788452148438, "rewards/margins": 1.4965531826019287, "rewards/rejected": 1.105825662612915, "step": 1976 }, { "epoch": 0.44, "learning_rate": 9.094846202740162e-06, "logits/chosen": -1.618772268295288, "logits/rejected": -1.558789610862732, "logps/chosen": -102.43883514404297, "logps/rejected": -52.150611877441406, "loss": 1.4832, "rewards/accuracies": 1.0, "rewards/chosen": 5.154163360595703, "rewards/margins": 1.7767333984375, "rewards/rejected": 3.377429962158203, "step": 1977 }, { "epoch": 0.44, "learning_rate": 9.0938174347561e-06, "logits/chosen": -1.8665001392364502, "logits/rejected": -1.8422573804855347, "logps/chosen": -75.40120697021484, "logps/rejected": -65.19953918457031, "loss": 1.2106, "rewards/accuracies": 0.0, "rewards/chosen": 2.0282485485076904, "rewards/margins": -1.9810569286346436, "rewards/rejected": 4.009305477142334, "step": 1978 }, { "epoch": 0.44, "learning_rate": 9.092788140726338e-06, "logits/chosen": -1.3615792989730835, "logits/rejected": -1.3285489082336426, "logps/chosen": -29.46541976928711, "logps/rejected": -25.474117279052734, "loss": 0.2074, "rewards/accuracies": 1.0, "rewards/chosen": 2.631981372833252, "rewards/margins": 0.7833298444747925, "rewards/rejected": 1.8486515283584595, "step": 1979 }, { "epoch": 0.44, "learning_rate": 9.091758320783139e-06, "logits/chosen": -1.405978798866272, "logits/rejected": -1.407286524772644, "logps/chosen": -25.244407653808594, "logps/rejected": -42.399131774902344, "loss": 0.4221, "rewards/accuracies": 0.0, "rewards/chosen": 3.5153729915618896, "rewards/margins": -0.2762126922607422, "rewards/rejected": 3.791585683822632, "step": 1980 }, { "epoch": 0.44, "learning_rate": 9.090727975058833e-06, "logits/chosen": -1.262160062789917, "logits/rejected": -1.2254328727722168, "logps/chosen": -99.93541717529297, "logps/rejected": -65.7652587890625, "loss": 1.5847, "rewards/accuracies": 0.0, "rewards/chosen": 2.706346273422241, "rewards/margins": -1.9637715816497803, "rewards/rejected": 4.6701178550720215, "step": 1981 }, { "epoch": 0.44, "learning_rate": 9.089697103685815e-06, "logits/chosen": -1.462603211402893, "logits/rejected": -1.2657904624938965, "logps/chosen": -104.55636596679688, "logps/rejected": -66.70315551757812, "loss": 0.0952, "rewards/accuracies": 1.0, "rewards/chosen": 6.290469646453857, "rewards/margins": 3.1254160404205322, "rewards/rejected": 3.165053606033325, "step": 1982 }, { "epoch": 0.44, "learning_rate": 9.08866570679655e-06, "logits/chosen": -1.3575142621994019, "logits/rejected": -1.40591561794281, "logps/chosen": -45.24420928955078, "logps/rejected": -71.39234161376953, "loss": 1.8191, "rewards/accuracies": 0.0, "rewards/chosen": 2.631786346435547, "rewards/margins": -3.296539306640625, "rewards/rejected": 5.928325653076172, "step": 1983 }, { "epoch": 0.44, "learning_rate": 9.087633784523574e-06, "logits/chosen": -1.3560470342636108, "logits/rejected": -1.249536156654358, "logps/chosen": -46.78071594238281, "logps/rejected": -52.626976013183594, "loss": 1.0013, "rewards/accuracies": 0.0, "rewards/chosen": 1.941016435623169, "rewards/margins": -0.8994796276092529, "rewards/rejected": 2.840496063232422, "step": 1984 }, { "epoch": 0.44, "learning_rate": 9.08660133699948e-06, "logits/chosen": -1.435017466545105, "logits/rejected": -1.4068539142608643, "logps/chosen": -36.112430572509766, "logps/rejected": -62.85826110839844, "loss": 0.4483, "rewards/accuracies": 0.0, "rewards/chosen": 3.16288423538208, "rewards/margins": -0.23934197425842285, "rewards/rejected": 3.402226209640503, "step": 1985 }, { "epoch": 0.44, "learning_rate": 9.085568364356939e-06, "logits/chosen": -1.35634446144104, "logits/rejected": -1.321632981300354, "logps/chosen": -37.397865295410156, "logps/rejected": -44.84056854248047, "loss": 0.3185, "rewards/accuracies": 1.0, "rewards/chosen": 2.653036594390869, "rewards/margins": 0.13474583625793457, "rewards/rejected": 2.5182907581329346, "step": 1986 }, { "epoch": 0.44, "learning_rate": 9.084534866728683e-06, "logits/chosen": -1.446379542350769, "logits/rejected": -1.4116363525390625, "logps/chosen": -74.54557037353516, "logps/rejected": -60.54290771484375, "loss": 1.0591, "rewards/accuracies": 0.0, "rewards/chosen": 3.3114891052246094, "rewards/margins": -1.126166820526123, "rewards/rejected": 4.437655925750732, "step": 1987 }, { "epoch": 0.44, "learning_rate": 9.083500844247517e-06, "logits/chosen": -1.3290777206420898, "logits/rejected": -1.28465735912323, "logps/chosen": -42.935760498046875, "logps/rejected": -44.731903076171875, "loss": 0.7591, "rewards/accuracies": 0.0, "rewards/chosen": 1.6499016284942627, "rewards/margins": -1.2690634727478027, "rewards/rejected": 2.9189651012420654, "step": 1988 }, { "epoch": 0.44, "learning_rate": 9.082466297046308e-06, "logits/chosen": -1.5989186763763428, "logits/rejected": -1.5865155458450317, "logps/chosen": -50.041236877441406, "logps/rejected": -78.85989379882812, "loss": 0.5548, "rewards/accuracies": 1.0, "rewards/chosen": 2.5780296325683594, "rewards/margins": 0.2553253173828125, "rewards/rejected": 2.322704315185547, "step": 1989 }, { "epoch": 0.44, "learning_rate": 9.081431225257994e-06, "logits/chosen": -1.5457091331481934, "logits/rejected": -1.587140679359436, "logps/chosen": -103.20816040039062, "logps/rejected": -116.56257629394531, "loss": 1.3252, "rewards/accuracies": 0.0, "rewards/chosen": 6.438334941864014, "rewards/margins": -0.6451826095581055, "rewards/rejected": 7.083517551422119, "step": 1990 }, { "epoch": 0.44, "learning_rate": 9.08039562901558e-06, "logits/chosen": -1.7088136672973633, "logits/rejected": -1.674991488456726, "logps/chosen": -79.05119323730469, "logps/rejected": -91.83377838134766, "loss": 1.0448, "rewards/accuracies": 0.0, "rewards/chosen": 1.9161590337753296, "rewards/margins": -0.4554802179336548, "rewards/rejected": 2.3716392517089844, "step": 1991 }, { "epoch": 0.44, "learning_rate": 9.079359508452138e-06, "logits/chosen": -1.5105743408203125, "logits/rejected": -1.5818859338760376, "logps/chosen": -142.9268798828125, "logps/rejected": -140.55508422851562, "loss": 1.1361, "rewards/accuracies": 0.0, "rewards/chosen": 6.562260627746582, "rewards/margins": -1.2482786178588867, "rewards/rejected": 7.810539245605469, "step": 1992 }, { "epoch": 0.44, "learning_rate": 9.078322863700803e-06, "logits/chosen": -1.4340801239013672, "logits/rejected": -1.241878867149353, "logps/chosen": -42.793495178222656, "logps/rejected": -72.64964294433594, "loss": 0.4079, "rewards/accuracies": 0.0, "rewards/chosen": 2.136171817779541, "rewards/margins": -0.23061823844909668, "rewards/rejected": 2.3667900562286377, "step": 1993 }, { "epoch": 0.44, "learning_rate": 9.077285694894786e-06, "logits/chosen": -1.328621745109558, "logits/rejected": -1.3345705270767212, "logps/chosen": -32.60906219482422, "logps/rejected": -68.14157104492188, "loss": 1.2133, "rewards/accuracies": 0.0, "rewards/chosen": 0.9802643060684204, "rewards/margins": -0.6158103942871094, "rewards/rejected": 1.5960747003555298, "step": 1994 }, { "epoch": 0.44, "learning_rate": 9.076248002167357e-06, "logits/chosen": -1.5930641889572144, "logits/rejected": -1.3282091617584229, "logps/chosen": -83.53749084472656, "logps/rejected": -123.32412719726562, "loss": 1.1295, "rewards/accuracies": 0.0, "rewards/chosen": 1.3863509893417358, "rewards/margins": -1.3536819219589233, "rewards/rejected": 2.740032911300659, "step": 1995 }, { "epoch": 0.44, "learning_rate": 9.07520978565186e-06, "logits/chosen": -1.9181184768676758, "logits/rejected": -1.9461650848388672, "logps/chosen": -81.80836486816406, "logps/rejected": -92.24476623535156, "loss": 0.7708, "rewards/accuracies": 0.0, "rewards/chosen": 6.16110372543335, "rewards/margins": -1.0480151176452637, "rewards/rejected": 7.209118843078613, "step": 1996 }, { "epoch": 0.44, "learning_rate": 9.074171045481701e-06, "logits/chosen": -1.4315414428710938, "logits/rejected": -1.412489414215088, "logps/chosen": -44.65672302246094, "logps/rejected": -38.298885345458984, "loss": 1.2988, "rewards/accuracies": 1.0, "rewards/chosen": 2.338918447494507, "rewards/margins": 0.8023160696029663, "rewards/rejected": 1.5366023778915405, "step": 1997 }, { "epoch": 0.44, "learning_rate": 9.073131781790358e-06, "logits/chosen": -1.5266529321670532, "logits/rejected": -1.5344046354293823, "logps/chosen": -31.90201187133789, "logps/rejected": -36.666934967041016, "loss": 0.2901, "rewards/accuracies": 1.0, "rewards/chosen": 2.833211898803711, "rewards/margins": 0.2560856342315674, "rewards/rejected": 2.5771262645721436, "step": 1998 }, { "epoch": 0.44, "learning_rate": 9.072091994711372e-06, "logits/chosen": -1.3206926584243774, "logits/rejected": -1.179214596748352, "logps/chosen": -74.097900390625, "logps/rejected": -42.73389434814453, "loss": 0.6454, "rewards/accuracies": 0.0, "rewards/chosen": 3.879936933517456, "rewards/margins": -0.7629668712615967, "rewards/rejected": 4.642903804779053, "step": 1999 }, { "epoch": 0.44, "learning_rate": 9.071051684378352e-06, "logits/chosen": -1.8495553731918335, "logits/rejected": -1.7422125339508057, "logps/chosen": -131.570068359375, "logps/rejected": -95.77565002441406, "loss": 1.0394, "rewards/accuracies": 1.0, "rewards/chosen": 7.343287944793701, "rewards/margins": 1.521585464477539, "rewards/rejected": 5.821702480316162, "step": 2000 }, { "epoch": 0.44, "learning_rate": 9.07001085092498e-06, "logits/chosen": -1.5285217761993408, "logits/rejected": -1.2612744569778442, "logps/chosen": -129.6092071533203, "logps/rejected": -59.20713424682617, "loss": 0.5719, "rewards/accuracies": 1.0, "rewards/chosen": 5.30462646484375, "rewards/margins": 5.4426798820495605, "rewards/rejected": -0.1380535215139389, "step": 2001 }, { "epoch": 0.44, "learning_rate": 9.068969494484996e-06, "logits/chosen": -1.5586386919021606, "logits/rejected": -1.4366408586502075, "logps/chosen": -60.9068717956543, "logps/rejected": -44.919803619384766, "loss": 0.2606, "rewards/accuracies": 1.0, "rewards/chosen": 4.33790922164917, "rewards/margins": 0.6951072216033936, "rewards/rejected": 3.6428020000457764, "step": 2002 }, { "epoch": 0.44, "learning_rate": 9.067927615192214e-06, "logits/chosen": -1.3767167329788208, "logits/rejected": -1.2986518144607544, "logps/chosen": -102.14814758300781, "logps/rejected": -87.48648071289062, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 6.332785129547119, "rewards/margins": 1.9833345413208008, "rewards/rejected": 4.349450588226318, "step": 2003 }, { "epoch": 0.44, "learning_rate": 9.066885213180512e-06, "logits/chosen": -1.2881220579147339, "logits/rejected": -1.2881220579147339, "logps/chosen": -11.77790641784668, "logps/rejected": -11.77790641784668, "loss": 1.6139, "rewards/accuracies": 0.0, "rewards/chosen": 0.9181346893310547, "rewards/margins": 0.0, "rewards/rejected": 0.9181346893310547, "step": 2004 }, { "epoch": 0.44, "learning_rate": 9.065842288583838e-06, "logits/chosen": -1.6904163360595703, "logits/rejected": -1.6643807888031006, "logps/chosen": -103.86532592773438, "logps/rejected": -89.16712188720703, "loss": 1.6991, "rewards/accuracies": 0.0, "rewards/chosen": 5.044273376464844, "rewards/margins": -0.728848934173584, "rewards/rejected": 5.773122310638428, "step": 2005 }, { "epoch": 0.44, "learning_rate": 9.064798841536203e-06, "logits/chosen": -1.6046775579452515, "logits/rejected": -1.5588527917861938, "logps/chosen": -96.83287048339844, "logps/rejected": -126.40861511230469, "loss": 0.3367, "rewards/accuracies": 1.0, "rewards/chosen": 5.79672384262085, "rewards/margins": 0.19942617416381836, "rewards/rejected": 5.597297668457031, "step": 2006 }, { "epoch": 0.44, "learning_rate": 9.063754872171686e-06, "logits/chosen": -1.354841709136963, "logits/rejected": -1.3713566064834595, "logps/chosen": -58.5107307434082, "logps/rejected": -82.42804718017578, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": 4.251199722290039, "rewards/margins": 1.8051722049713135, "rewards/rejected": 2.4460275173187256, "step": 2007 }, { "epoch": 0.44, "learning_rate": 9.062710380624439e-06, "logits/chosen": -1.4330955743789673, "logits/rejected": -1.4127012491226196, "logps/chosen": -147.6229248046875, "logps/rejected": -210.4402618408203, "loss": 0.6682, "rewards/accuracies": 0.0, "rewards/chosen": 7.393812656402588, "rewards/margins": -0.8933014869689941, "rewards/rejected": 8.287114143371582, "step": 2008 }, { "epoch": 0.44, "learning_rate": 9.061665367028676e-06, "logits/chosen": -1.4644359350204468, "logits/rejected": -1.3555443286895752, "logps/chosen": -100.77131652832031, "logps/rejected": -55.64457321166992, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 6.092817783355713, "rewards/margins": 3.704047918319702, "rewards/rejected": 2.3887698650360107, "step": 2009 }, { "epoch": 0.44, "learning_rate": 9.060619831518676e-06, "logits/chosen": -1.1930646896362305, "logits/rejected": -1.4709076881408691, "logps/chosen": -35.730812072753906, "logps/rejected": -31.412921905517578, "loss": 2.568, "rewards/accuracies": 1.0, "rewards/chosen": 3.1199989318847656, "rewards/margins": 1.3863941431045532, "rewards/rejected": 1.7336047887802124, "step": 2010 }, { "epoch": 0.45, "learning_rate": 9.05957377422879e-06, "logits/chosen": -1.1830321550369263, "logits/rejected": -1.13554048538208, "logps/chosen": -75.41625213623047, "logps/rejected": -56.8946647644043, "loss": 0.8492, "rewards/accuracies": 0.0, "rewards/chosen": 3.8050270080566406, "rewards/margins": -1.477107048034668, "rewards/rejected": 5.282134056091309, "step": 2011 }, { "epoch": 0.45, "learning_rate": 9.058527195293431e-06, "logits/chosen": -1.9502112865447998, "logits/rejected": -1.8997987508773804, "logps/chosen": -57.66180419921875, "logps/rejected": -87.86012268066406, "loss": 1.6, "rewards/accuracies": 0.0, "rewards/chosen": 4.4410576820373535, "rewards/margins": -1.643294334411621, "rewards/rejected": 6.084352016448975, "step": 2012 }, { "epoch": 0.45, "learning_rate": 9.057480094847085e-06, "logits/chosen": -1.1838107109069824, "logits/rejected": -1.184464931488037, "logps/chosen": -32.45642852783203, "logps/rejected": -91.6820297241211, "loss": 2.5807, "rewards/accuracies": 0.0, "rewards/chosen": 3.0160062313079834, "rewards/margins": -3.5662882328033447, "rewards/rejected": 6.582294464111328, "step": 2013 }, { "epoch": 0.45, "learning_rate": 9.056432473024302e-06, "logits/chosen": -1.573198676109314, "logits/rejected": -1.4722955226898193, "logps/chosen": -116.17242431640625, "logps/rejected": -111.29266357421875, "loss": 0.6132, "rewards/accuracies": 1.0, "rewards/chosen": 6.252606391906738, "rewards/margins": 5.0198869705200195, "rewards/rejected": 1.2327194213867188, "step": 2014 }, { "epoch": 0.45, "learning_rate": 9.055384329959695e-06, "logits/chosen": -1.5231292247772217, "logits/rejected": -1.5265065431594849, "logps/chosen": -97.86363220214844, "logps/rejected": -125.54114532470703, "loss": 0.6153, "rewards/accuracies": 0.0, "rewards/chosen": 2.3968276977539062, "rewards/margins": -0.840827226638794, "rewards/rejected": 3.2376549243927, "step": 2015 }, { "epoch": 0.45, "learning_rate": 9.054335665787952e-06, "logits/chosen": -1.338166356086731, "logits/rejected": -1.2919644117355347, "logps/chosen": -79.41039276123047, "logps/rejected": -38.92420196533203, "loss": 0.1347, "rewards/accuracies": 1.0, "rewards/chosen": 4.452841281890869, "rewards/margins": 1.2321205139160156, "rewards/rejected": 3.2207207679748535, "step": 2016 }, { "epoch": 0.45, "learning_rate": 9.053286480643822e-06, "logits/chosen": -1.1853091716766357, "logits/rejected": -1.1853091716766357, "logps/chosen": -86.11395263671875, "logps/rejected": -86.11395263671875, "loss": 0.3477, "rewards/accuracies": 0.0, "rewards/chosen": 7.410912990570068, "rewards/margins": 0.0, "rewards/rejected": 7.410912990570068, "step": 2017 }, { "epoch": 0.45, "learning_rate": 9.052236774662123e-06, "logits/chosen": -1.3962833881378174, "logits/rejected": -1.2070029973983765, "logps/chosen": -112.87677001953125, "logps/rejected": -89.79995727539062, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": 6.782275676727295, "rewards/margins": 2.610043525695801, "rewards/rejected": 4.172232151031494, "step": 2018 }, { "epoch": 0.45, "learning_rate": 9.051186547977739e-06, "logits/chosen": -1.4077959060668945, "logits/rejected": -1.2410756349563599, "logps/chosen": -72.04598236083984, "logps/rejected": -44.75388717651367, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 4.53218936920166, "rewards/margins": 2.9926068782806396, "rewards/rejected": 1.5395824909210205, "step": 2019 }, { "epoch": 0.45, "learning_rate": 9.050135800725623e-06, "logits/chosen": -1.4818124771118164, "logits/rejected": -1.4025242328643799, "logps/chosen": -60.65916442871094, "logps/rejected": -179.41336059570312, "loss": 1.4182, "rewards/accuracies": 0.0, "rewards/chosen": 6.239991664886475, "rewards/margins": -1.4650497436523438, "rewards/rejected": 7.705041408538818, "step": 2020 }, { "epoch": 0.45, "learning_rate": 9.049084533040794e-06, "logits/chosen": -1.6575531959533691, "logits/rejected": -1.7479816675186157, "logps/chosen": -84.20008850097656, "logps/rejected": -109.69058227539062, "loss": 0.8281, "rewards/accuracies": 0.0, "rewards/chosen": 4.706254482269287, "rewards/margins": -1.4172611236572266, "rewards/rejected": 6.123515605926514, "step": 2021 }, { "epoch": 0.45, "learning_rate": 9.048032745058335e-06, "logits/chosen": -1.6007933616638184, "logits/rejected": -1.5385305881500244, "logps/chosen": -103.6347427368164, "logps/rejected": -70.72467041015625, "loss": 1.0179, "rewards/accuracies": 1.0, "rewards/chosen": 7.115238189697266, "rewards/margins": 5.261685371398926, "rewards/rejected": 1.853553056716919, "step": 2022 }, { "epoch": 0.45, "learning_rate": 9.0469804369134e-06, "logits/chosen": -1.2549067735671997, "logits/rejected": -1.1947150230407715, "logps/chosen": -54.43666458129883, "logps/rejected": -32.39519119262695, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": 2.7536182403564453, "rewards/margins": 1.2332035303115845, "rewards/rejected": 1.5204147100448608, "step": 2023 }, { "epoch": 0.45, "learning_rate": 9.045927608741207e-06, "logits/chosen": -1.2448012828826904, "logits/rejected": -1.2797714471817017, "logps/chosen": -125.45238494873047, "logps/rejected": -85.94900512695312, "loss": 1.9965, "rewards/accuracies": 1.0, "rewards/chosen": 4.988033294677734, "rewards/margins": 0.47522640228271484, "rewards/rejected": 4.5128068923950195, "step": 2024 }, { "epoch": 0.45, "learning_rate": 9.044874260677043e-06, "logits/chosen": -1.427948236465454, "logits/rejected": -1.2943429946899414, "logps/chosen": -43.89495086669922, "logps/rejected": -6.923564910888672, "loss": 1.1164, "rewards/accuracies": 1.0, "rewards/chosen": 1.8516327142715454, "rewards/margins": 0.6201597452163696, "rewards/rejected": 1.2314729690551758, "step": 2025 }, { "epoch": 0.45, "learning_rate": 9.043820392856259e-06, "logits/chosen": -1.7624032497406006, "logits/rejected": -1.6874008178710938, "logps/chosen": -80.86647033691406, "logps/rejected": -72.0289306640625, "loss": 0.4195, "rewards/accuracies": 0.0, "rewards/chosen": 3.0555121898651123, "rewards/margins": -0.21561050415039062, "rewards/rejected": 3.271122694015503, "step": 2026 }, { "epoch": 0.45, "learning_rate": 9.042766005414278e-06, "logits/chosen": -1.3667995929718018, "logits/rejected": -1.3515762090682983, "logps/chosen": -48.85912322998047, "logps/rejected": -39.04689025878906, "loss": 0.8954, "rewards/accuracies": 0.0, "rewards/chosen": 1.3340564966201782, "rewards/margins": -1.5397502183914185, "rewards/rejected": 2.8738067150115967, "step": 2027 }, { "epoch": 0.45, "learning_rate": 9.041711098486583e-06, "logits/chosen": -1.4230924844741821, "logits/rejected": -1.4245736598968506, "logps/chosen": -57.95454406738281, "logps/rejected": -23.03103256225586, "loss": 2.488, "rewards/accuracies": 1.0, "rewards/chosen": 3.124005079269409, "rewards/margins": 0.14128947257995605, "rewards/rejected": 2.982715606689453, "step": 2028 }, { "epoch": 0.45, "learning_rate": 9.040655672208727e-06, "logits/chosen": -1.5709205865859985, "logits/rejected": -1.5131069421768188, "logps/chosen": -48.98849105834961, "logps/rejected": -60.1876335144043, "loss": 0.3787, "rewards/accuracies": 1.0, "rewards/chosen": 4.857610702514648, "rewards/margins": 1.7728188037872314, "rewards/rejected": 3.084791898727417, "step": 2029 }, { "epoch": 0.45, "learning_rate": 9.03959972671633e-06, "logits/chosen": -1.4385454654693604, "logits/rejected": -1.4385454654693604, "logps/chosen": -30.490915298461914, "logps/rejected": -30.490915298461914, "loss": 0.4953, "rewards/accuracies": 0.0, "rewards/chosen": 2.3071000576019287, "rewards/margins": 0.0, "rewards/rejected": 2.3071000576019287, "step": 2030 }, { "epoch": 0.45, "learning_rate": 9.03854326214508e-06, "logits/chosen": -1.5977321863174438, "logits/rejected": -1.567222237586975, "logps/chosen": -54.561119079589844, "logps/rejected": -60.686668395996094, "loss": 0.6129, "rewards/accuracies": 1.0, "rewards/chosen": 2.8883461952209473, "rewards/margins": 0.792269229888916, "rewards/rejected": 2.0960769653320312, "step": 2031 }, { "epoch": 0.45, "learning_rate": 9.037486278630729e-06, "logits/chosen": -1.4337482452392578, "logits/rejected": -1.2844278812408447, "logps/chosen": -48.2205810546875, "logps/rejected": -31.986812591552734, "loss": 0.2163, "rewards/accuracies": 1.0, "rewards/chosen": 3.7827064990997314, "rewards/margins": 2.096039295196533, "rewards/rejected": 1.6866673231124878, "step": 2032 }, { "epoch": 0.45, "learning_rate": 9.036428776309096e-06, "logits/chosen": -1.7818043231964111, "logits/rejected": -1.7253942489624023, "logps/chosen": -99.58921813964844, "logps/rejected": -98.49369812011719, "loss": 0.3825, "rewards/accuracies": 1.0, "rewards/chosen": 4.940125942230225, "rewards/margins": 1.6671538352966309, "rewards/rejected": 3.2729721069335938, "step": 2033 }, { "epoch": 0.45, "learning_rate": 9.03537075531607e-06, "logits/chosen": -1.6477775573730469, "logits/rejected": -1.5015729665756226, "logps/chosen": -170.71734619140625, "logps/rejected": -122.22331237792969, "loss": 0.3809, "rewards/accuracies": 0.0, "rewards/chosen": 6.540223598480225, "rewards/margins": -0.10903358459472656, "rewards/rejected": 6.649257183074951, "step": 2034 }, { "epoch": 0.45, "learning_rate": 9.034312215787603e-06, "logits/chosen": -1.6634269952774048, "logits/rejected": -1.7167586088180542, "logps/chosen": -82.84661865234375, "logps/rejected": -103.19209289550781, "loss": 1.9749, "rewards/accuracies": 0.0, "rewards/chosen": 3.994960069656372, "rewards/margins": -3.9231550693511963, "rewards/rejected": 7.918115139007568, "step": 2035 }, { "epoch": 0.45, "learning_rate": 9.033253157859715e-06, "logits/chosen": -1.6167880296707153, "logits/rejected": -1.545604944229126, "logps/chosen": -115.28170776367188, "logps/rejected": -45.19788360595703, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 6.752055644989014, "rewards/margins": 4.167937755584717, "rewards/rejected": 2.584117889404297, "step": 2036 }, { "epoch": 0.45, "learning_rate": 9.03219358166849e-06, "logits/chosen": -1.4764553308486938, "logits/rejected": -1.3848557472229004, "logps/chosen": -43.32927322387695, "logps/rejected": -25.70285415649414, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 5.115640640258789, "rewards/margins": 4.385176658630371, "rewards/rejected": 0.7304641604423523, "step": 2037 }, { "epoch": 0.45, "learning_rate": 9.031133487350084e-06, "logits/chosen": -1.2953884601593018, "logits/rejected": -1.3196700811386108, "logps/chosen": -73.73912811279297, "logps/rejected": -157.08697509765625, "loss": 3.4223, "rewards/accuracies": 0.0, "rewards/chosen": 2.9655539989471436, "rewards/margins": -6.354860305786133, "rewards/rejected": 9.320414543151855, "step": 2038 }, { "epoch": 0.45, "learning_rate": 9.030072875040714e-06, "logits/chosen": -1.5715194940567017, "logits/rejected": -1.6190950870513916, "logps/chosen": -82.40708923339844, "logps/rejected": -99.4122543334961, "loss": 1.7388, "rewards/accuracies": 0.0, "rewards/chosen": 2.981671094894409, "rewards/margins": -3.2089760303497314, "rewards/rejected": 6.190647125244141, "step": 2039 }, { "epoch": 0.45, "learning_rate": 9.029011744876669e-06, "logits/chosen": -1.475666880607605, "logits/rejected": -1.329484224319458, "logps/chosen": -128.9252166748047, "logps/rejected": -209.95211791992188, "loss": 2.918, "rewards/accuracies": 0.0, "rewards/chosen": 5.498340129852295, "rewards/margins": -0.8973050117492676, "rewards/rejected": 6.3956451416015625, "step": 2040 }, { "epoch": 0.45, "learning_rate": 9.027950096994299e-06, "logits/chosen": -1.5379631519317627, "logits/rejected": -1.5628275871276855, "logps/chosen": -127.37962341308594, "logps/rejected": -77.97090911865234, "loss": 0.645, "rewards/accuracies": 0.0, "rewards/chosen": 5.107089519500732, "rewards/margins": -0.9638357162475586, "rewards/rejected": 6.070925235748291, "step": 2041 }, { "epoch": 0.45, "learning_rate": 9.026887931530026e-06, "logits/chosen": -1.2694847583770752, "logits/rejected": -1.1729322671890259, "logps/chosen": -68.80067443847656, "logps/rejected": -56.16950988769531, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 5.678784370422363, "rewards/margins": 2.0738747119903564, "rewards/rejected": 3.604909658432007, "step": 2042 }, { "epoch": 0.45, "learning_rate": 9.025825248620332e-06, "logits/chosen": -1.6938135623931885, "logits/rejected": -1.644911527633667, "logps/chosen": -53.471649169921875, "logps/rejected": -37.78550720214844, "loss": 0.3711, "rewards/accuracies": 1.0, "rewards/chosen": 2.2736968994140625, "rewards/margins": 0.2098219394683838, "rewards/rejected": 2.0638749599456787, "step": 2043 }, { "epoch": 0.45, "learning_rate": 9.024762048401775e-06, "logits/chosen": -1.397645354270935, "logits/rejected": -1.397645354270935, "logps/chosen": -91.73654174804688, "logps/rejected": -91.73654174804688, "loss": 0.3523, "rewards/accuracies": 0.0, "rewards/chosen": 6.1500091552734375, "rewards/margins": 0.0, "rewards/rejected": 6.1500091552734375, "step": 2044 }, { "epoch": 0.45, "learning_rate": 9.023698331010966e-06, "logits/chosen": -1.498343825340271, "logits/rejected": -1.5097579956054688, "logps/chosen": -83.71990966796875, "logps/rejected": -96.16813659667969, "loss": 1.3156, "rewards/accuracies": 0.0, "rewards/chosen": 3.6074821949005127, "rewards/margins": -2.536510705947876, "rewards/rejected": 6.143992900848389, "step": 2045 }, { "epoch": 0.45, "learning_rate": 9.022634096584597e-06, "logits/chosen": -1.5749815702438354, "logits/rejected": -1.5578763484954834, "logps/chosen": -51.59394836425781, "logps/rejected": -94.70611572265625, "loss": 0.4993, "rewards/accuracies": 0.0, "rewards/chosen": 1.7448959350585938, "rewards/margins": -0.036371588706970215, "rewards/rejected": 1.781267523765564, "step": 2046 }, { "epoch": 0.45, "learning_rate": 9.021569345259415e-06, "logits/chosen": -1.8517452478408813, "logits/rejected": -1.8265568017959595, "logps/chosen": -77.90800476074219, "logps/rejected": -58.28925704956055, "loss": 0.7103, "rewards/accuracies": 0.0, "rewards/chosen": 4.050232887268066, "rewards/margins": -1.1407666206359863, "rewards/rejected": 5.190999507904053, "step": 2047 }, { "epoch": 0.45, "learning_rate": 9.02050407717224e-06, "logits/chosen": -1.2723743915557861, "logits/rejected": -1.1659413576126099, "logps/chosen": -40.2166633605957, "logps/rejected": -16.565738677978516, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": 3.2986905574798584, "rewards/margins": 3.3962738513946533, "rewards/rejected": -0.09758319705724716, "step": 2048 }, { "epoch": 0.45, "learning_rate": 9.019438292459958e-06, "logits/chosen": -1.5732285976409912, "logits/rejected": -1.5996952056884766, "logps/chosen": -60.85298538208008, "logps/rejected": -79.91801452636719, "loss": 0.9103, "rewards/accuracies": 0.0, "rewards/chosen": 2.973789691925049, "rewards/margins": -1.513293743133545, "rewards/rejected": 4.487083435058594, "step": 2049 }, { "epoch": 0.45, "learning_rate": 9.018371991259516e-06, "logits/chosen": -1.733035922050476, "logits/rejected": -1.7131996154785156, "logps/chosen": -30.55313491821289, "logps/rejected": -51.69953918457031, "loss": 1.1895, "rewards/accuracies": 1.0, "rewards/chosen": 3.918968677520752, "rewards/margins": 0.8722248077392578, "rewards/rejected": 3.046743869781494, "step": 2050 }, { "epoch": 0.45, "learning_rate": 9.017305173707932e-06, "logits/chosen": -1.6395525932312012, "logits/rejected": -1.503159523010254, "logps/chosen": -136.23825073242188, "logps/rejected": -20.644630432128906, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": 6.074231147766113, "rewards/margins": 3.7552380561828613, "rewards/rejected": 2.318993091583252, "step": 2051 }, { "epoch": 0.45, "learning_rate": 9.016237839942294e-06, "logits/chosen": -0.9162720441818237, "logits/rejected": -0.8644344210624695, "logps/chosen": -40.493919372558594, "logps/rejected": -14.176881790161133, "loss": 0.7311, "rewards/accuracies": 1.0, "rewards/chosen": 1.7142181396484375, "rewards/margins": 0.37588441371917725, "rewards/rejected": 1.3383337259292603, "step": 2052 }, { "epoch": 0.45, "learning_rate": 9.015169990099746e-06, "logits/chosen": -1.1846504211425781, "logits/rejected": -1.187422275543213, "logps/chosen": -71.60807037353516, "logps/rejected": -100.83306884765625, "loss": 0.5911, "rewards/accuracies": 1.0, "rewards/chosen": 2.8146140575408936, "rewards/margins": 1.4401510953903198, "rewards/rejected": 1.3744629621505737, "step": 2053 }, { "epoch": 0.45, "learning_rate": 9.014101624317506e-06, "logits/chosen": -1.8287911415100098, "logits/rejected": -1.3935283422470093, "logps/chosen": -86.96578216552734, "logps/rejected": -60.25394821166992, "loss": 0.3545, "rewards/accuracies": 1.0, "rewards/chosen": 4.713334083557129, "rewards/margins": 2.515591859817505, "rewards/rejected": 2.197742223739624, "step": 2054 }, { "epoch": 0.45, "learning_rate": 9.013032742732858e-06, "logits/chosen": -1.481567144393921, "logits/rejected": -1.4626833200454712, "logps/chosen": -63.51097869873047, "logps/rejected": -90.27058410644531, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": 4.962606906890869, "rewards/margins": 2.5226426124572754, "rewards/rejected": 2.4399642944335938, "step": 2055 }, { "epoch": 0.46, "learning_rate": 9.01196334548315e-06, "logits/chosen": -1.3555339574813843, "logits/rejected": -1.4061506986618042, "logps/chosen": -57.53891372680664, "logps/rejected": -127.3739242553711, "loss": 2.2095, "rewards/accuracies": 0.0, "rewards/chosen": 4.006837844848633, "rewards/margins": -3.808986186981201, "rewards/rejected": 7.815824031829834, "step": 2056 }, { "epoch": 0.46, "learning_rate": 9.010893432705796e-06, "logits/chosen": -1.4412273168563843, "logits/rejected": -1.3940738439559937, "logps/chosen": -48.03199768066406, "logps/rejected": -50.17594528198242, "loss": 0.5061, "rewards/accuracies": 1.0, "rewards/chosen": 2.500563144683838, "rewards/margins": 0.5829838514328003, "rewards/rejected": 1.9175792932510376, "step": 2057 }, { "epoch": 0.46, "learning_rate": 9.009823004538278e-06, "logits/chosen": -1.6350765228271484, "logits/rejected": -0.9103372097015381, "logps/chosen": -83.5593032836914, "logps/rejected": -137.78912353515625, "loss": 1.2554, "rewards/accuracies": 0.0, "rewards/chosen": 2.988726854324341, "rewards/margins": -2.353253126144409, "rewards/rejected": 5.34197998046875, "step": 2058 }, { "epoch": 0.46, "learning_rate": 9.008752061118143e-06, "logits/chosen": -1.2440516948699951, "logits/rejected": -1.2440516948699951, "logps/chosen": -15.970422744750977, "logps/rejected": -15.970422744750977, "loss": 0.349, "rewards/accuracies": 0.0, "rewards/chosen": 2.2590372562408447, "rewards/margins": 0.0, "rewards/rejected": 2.2590372562408447, "step": 2059 }, { "epoch": 0.46, "learning_rate": 9.007680602583005e-06, "logits/chosen": -1.403097152709961, "logits/rejected": -1.4953136444091797, "logps/chosen": -59.122283935546875, "logps/rejected": -121.146728515625, "loss": 2.6061, "rewards/accuracies": 0.0, "rewards/chosen": 3.366076707839966, "rewards/margins": -5.206186294555664, "rewards/rejected": 8.57226276397705, "step": 2060 }, { "epoch": 0.46, "learning_rate": 9.006608629070543e-06, "logits/chosen": -1.5642286539077759, "logits/rejected": -1.6016608476638794, "logps/chosen": -162.07211303710938, "logps/rejected": -214.29183959960938, "loss": 0.6246, "rewards/accuracies": 0.0, "rewards/chosen": 5.871493816375732, "rewards/margins": -0.8423886299133301, "rewards/rejected": 6.7138824462890625, "step": 2061 }, { "epoch": 0.46, "learning_rate": 9.005536140718506e-06, "logits/chosen": -1.5601624250411987, "logits/rejected": -1.5716142654418945, "logps/chosen": -36.38755798339844, "logps/rejected": -50.71934509277344, "loss": 0.3089, "rewards/accuracies": 1.0, "rewards/chosen": 2.139723539352417, "rewards/margins": 0.16089129447937012, "rewards/rejected": 1.9788322448730469, "step": 2062 }, { "epoch": 0.46, "learning_rate": 9.004463137664701e-06, "logits/chosen": -1.2275420427322388, "logits/rejected": -1.113625407218933, "logps/chosen": -41.39905548095703, "logps/rejected": -5.636115074157715, "loss": 0.12, "rewards/accuracies": 1.0, "rewards/chosen": 2.6107537746429443, "rewards/margins": 1.5326365232467651, "rewards/rejected": 1.0781172513961792, "step": 2063 }, { "epoch": 0.46, "learning_rate": 9.003389620047012e-06, "logits/chosen": -1.5449557304382324, "logits/rejected": -1.502271294593811, "logps/chosen": -80.12092590332031, "logps/rejected": -60.83745574951172, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 4.4796462059021, "rewards/margins": 1.878908395767212, "rewards/rejected": 2.6007378101348877, "step": 2064 }, { "epoch": 0.46, "learning_rate": 9.002315588003378e-06, "logits/chosen": -1.4660221338272095, "logits/rejected": -1.4431588649749756, "logps/chosen": -53.879150390625, "logps/rejected": -77.50955200195312, "loss": 0.2652, "rewards/accuracies": 1.0, "rewards/chosen": 2.6110076904296875, "rewards/margins": 0.39220428466796875, "rewards/rejected": 2.2188034057617188, "step": 2065 }, { "epoch": 0.46, "learning_rate": 9.001241041671814e-06, "logits/chosen": -1.5582122802734375, "logits/rejected": -1.3134597539901733, "logps/chosen": -142.70809936523438, "logps/rejected": -67.17384338378906, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 6.767337322235107, "rewards/margins": 3.380746603012085, "rewards/rejected": 3.3865907192230225, "step": 2066 }, { "epoch": 0.46, "learning_rate": 9.000165981190396e-06, "logits/chosen": -1.6787033081054688, "logits/rejected": -1.6800496578216553, "logps/chosen": -66.68673706054688, "logps/rejected": -40.77141571044922, "loss": 0.8845, "rewards/accuracies": 0.0, "rewards/chosen": 1.294032335281372, "rewards/margins": -1.5754363536834717, "rewards/rejected": 2.8694686889648438, "step": 2067 }, { "epoch": 0.46, "learning_rate": 8.999090406697263e-06, "logits/chosen": -1.5338621139526367, "logits/rejected": -1.4906803369522095, "logps/chosen": -113.22516632080078, "logps/rejected": -104.68185424804688, "loss": 0.1634, "rewards/accuracies": 1.0, "rewards/chosen": 5.664217472076416, "rewards/margins": 1.1799430847167969, "rewards/rejected": 4.484274387359619, "step": 2068 }, { "epoch": 0.46, "learning_rate": 8.998014318330627e-06, "logits/chosen": -1.4592199325561523, "logits/rejected": -1.4696494340896606, "logps/chosen": -58.607696533203125, "logps/rejected": -40.687461853027344, "loss": 1.6086, "rewards/accuracies": 0.0, "rewards/chosen": 1.2245689630508423, "rewards/margins": -1.9593490362167358, "rewards/rejected": 3.183917999267578, "step": 2069 }, { "epoch": 0.46, "learning_rate": 8.996937716228763e-06, "logits/chosen": -1.4608111381530762, "logits/rejected": -1.377488613128662, "logps/chosen": -147.93792724609375, "logps/rejected": -140.2901153564453, "loss": 0.0967, "rewards/accuracies": 1.0, "rewards/chosen": 8.042633056640625, "rewards/margins": 1.6520156860351562, "rewards/rejected": 6.390617370605469, "step": 2070 }, { "epoch": 0.46, "learning_rate": 8.99586060053001e-06, "logits/chosen": -1.4503087997436523, "logits/rejected": -1.335655689239502, "logps/chosen": -47.38441467285156, "logps/rejected": -22.92757797241211, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 3.734666585922241, "rewards/margins": 3.2707157135009766, "rewards/rejected": 0.4639509320259094, "step": 2071 }, { "epoch": 0.46, "learning_rate": 8.994782971372776e-06, "logits/chosen": -1.4621195793151855, "logits/rejected": -1.4621195793151855, "logps/chosen": -60.805511474609375, "logps/rejected": -60.805511474609375, "loss": 0.8422, "rewards/accuracies": 0.0, "rewards/chosen": 3.0632035732269287, "rewards/margins": 0.0, "rewards/rejected": 3.0632035732269287, "step": 2072 }, { "epoch": 0.46, "learning_rate": 8.993704828895533e-06, "logits/chosen": -1.8233683109283447, "logits/rejected": -1.7875926494598389, "logps/chosen": -88.7654800415039, "logps/rejected": -86.92866516113281, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": 6.106734752655029, "rewards/margins": 3.6843087673187256, "rewards/rejected": 2.4224259853363037, "step": 2073 }, { "epoch": 0.46, "learning_rate": 8.99262617323682e-06, "logits/chosen": -1.549971103668213, "logits/rejected": -1.4960262775421143, "logps/chosen": -55.4423828125, "logps/rejected": -9.420398712158203, "loss": 0.4748, "rewards/accuracies": 1.0, "rewards/chosen": 2.885449171066284, "rewards/margins": 1.229480266571045, "rewards/rejected": 1.6559689044952393, "step": 2074 }, { "epoch": 0.46, "learning_rate": 8.991547004535244e-06, "logits/chosen": -1.6954107284545898, "logits/rejected": -1.6763166189193726, "logps/chosen": -70.54901123046875, "logps/rejected": -47.14262008666992, "loss": 1.2795, "rewards/accuracies": 0.0, "rewards/chosen": 2.295452833175659, "rewards/margins": -2.0590946674346924, "rewards/rejected": 4.354547500610352, "step": 2075 }, { "epoch": 0.46, "learning_rate": 8.99046732292947e-06, "logits/chosen": -1.5809154510498047, "logits/rejected": -1.5747079849243164, "logps/chosen": -62.42905807495117, "logps/rejected": -80.58821868896484, "loss": 1.6715, "rewards/accuracies": 0.0, "rewards/chosen": 2.7361416816711426, "rewards/margins": -0.12560534477233887, "rewards/rejected": 2.8617470264434814, "step": 2076 }, { "epoch": 0.46, "learning_rate": 8.98938712855824e-06, "logits/chosen": -1.3674030303955078, "logits/rejected": -1.4114081859588623, "logps/chosen": -62.728729248046875, "logps/rejected": -56.834800720214844, "loss": 0.8957, "rewards/accuracies": 0.0, "rewards/chosen": 2.545611619949341, "rewards/margins": -0.8612105846405029, "rewards/rejected": 3.4068222045898438, "step": 2077 }, { "epoch": 0.46, "learning_rate": 8.988306421560354e-06, "logits/chosen": -1.762033224105835, "logits/rejected": -1.4743142127990723, "logps/chosen": -65.38645935058594, "logps/rejected": -144.23605346679688, "loss": 0.3155, "rewards/accuracies": 1.0, "rewards/chosen": 6.677975654602051, "rewards/margins": 0.1350541114807129, "rewards/rejected": 6.542921543121338, "step": 2078 }, { "epoch": 0.46, "learning_rate": 8.98722520207468e-06, "logits/chosen": -1.2330210208892822, "logits/rejected": -1.2330210208892822, "logps/chosen": -43.71470642089844, "logps/rejected": -43.71470642089844, "loss": 0.5175, "rewards/accuracies": 0.0, "rewards/chosen": 3.339421033859253, "rewards/margins": 0.0, "rewards/rejected": 3.339421033859253, "step": 2079 }, { "epoch": 0.46, "learning_rate": 8.986143470240152e-06, "logits/chosen": -1.3255399465560913, "logits/rejected": -1.2361931800842285, "logps/chosen": -52.57511901855469, "logps/rejected": -16.01488494873047, "loss": 0.056, "rewards/accuracies": 1.0, "rewards/chosen": 3.0600807666778564, "rewards/margins": 2.2992300987243652, "rewards/rejected": 0.7608505487442017, "step": 2080 }, { "epoch": 0.46, "learning_rate": 8.98506122619577e-06, "logits/chosen": -1.4457186460494995, "logits/rejected": -1.4487988948822021, "logps/chosen": -68.9534912109375, "logps/rejected": -89.4157485961914, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": 3.931727647781372, "rewards/margins": 1.4287199974060059, "rewards/rejected": 2.503007650375366, "step": 2081 }, { "epoch": 0.46, "learning_rate": 8.983978470080603e-06, "logits/chosen": -1.5225528478622437, "logits/rejected": -1.4745380878448486, "logps/chosen": -90.05329132080078, "logps/rejected": -107.87178039550781, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": 7.405341625213623, "rewards/margins": 1.8584041595458984, "rewards/rejected": 5.546937465667725, "step": 2082 }, { "epoch": 0.46, "learning_rate": 8.982895202033776e-06, "logits/chosen": -1.2849379777908325, "logits/rejected": -1.2601265907287598, "logps/chosen": -63.46548843383789, "logps/rejected": -60.67551803588867, "loss": 0.3474, "rewards/accuracies": 1.0, "rewards/chosen": 2.573643207550049, "rewards/margins": 0.35552072525024414, "rewards/rejected": 2.2181224822998047, "step": 2083 }, { "epoch": 0.46, "learning_rate": 8.981811422194493e-06, "logits/chosen": -1.4438098669052124, "logits/rejected": -1.4655362367630005, "logps/chosen": -57.01020050048828, "logps/rejected": -75.96688842773438, "loss": 0.6636, "rewards/accuracies": 0.0, "rewards/chosen": 2.71329665184021, "rewards/margins": -0.8634734153747559, "rewards/rejected": 3.576770067214966, "step": 2084 }, { "epoch": 0.46, "learning_rate": 8.980727130702014e-06, "logits/chosen": -1.5462141036987305, "logits/rejected": -1.5739468336105347, "logps/chosen": -107.93832397460938, "logps/rejected": -193.40013122558594, "loss": 0.3256, "rewards/accuracies": 1.0, "rewards/chosen": 9.225374221801758, "rewards/margins": 0.3644256591796875, "rewards/rejected": 8.86094856262207, "step": 2085 }, { "epoch": 0.46, "learning_rate": 8.979642327695668e-06, "logits/chosen": -1.3699666261672974, "logits/rejected": -1.3699666261672974, "logps/chosen": -29.289432525634766, "logps/rejected": -29.289432525634766, "loss": 0.3493, "rewards/accuracies": 0.0, "rewards/chosen": 3.654266834259033, "rewards/margins": 0.0, "rewards/rejected": 3.654266834259033, "step": 2086 }, { "epoch": 0.46, "learning_rate": 8.978557013314848e-06, "logits/chosen": -1.7549619674682617, "logits/rejected": -1.6643800735473633, "logps/chosen": -42.664695739746094, "logps/rejected": -28.231155395507812, "loss": 2.3849, "rewards/accuracies": 1.0, "rewards/chosen": 2.4791200160980225, "rewards/margins": 1.550870656967163, "rewards/rejected": 0.9282493591308594, "step": 2087 }, { "epoch": 0.46, "learning_rate": 8.977471187699019e-06, "logits/chosen": -1.3268061876296997, "logits/rejected": -1.4445791244506836, "logps/chosen": -68.783203125, "logps/rejected": -105.668701171875, "loss": 2.0508, "rewards/accuracies": 0.0, "rewards/chosen": 4.406381130218506, "rewards/margins": -2.9560275077819824, "rewards/rejected": 7.362408638000488, "step": 2088 }, { "epoch": 0.46, "learning_rate": 8.976384850987702e-06, "logits/chosen": -1.5276820659637451, "logits/rejected": -1.4736988544464111, "logps/chosen": -44.072715759277344, "logps/rejected": -48.56919860839844, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": 3.9701364040374756, "rewards/margins": 1.9099457263946533, "rewards/rejected": 2.0601906776428223, "step": 2089 }, { "epoch": 0.46, "learning_rate": 8.97529800332049e-06, "logits/chosen": -1.314393162727356, "logits/rejected": -1.3753644227981567, "logps/chosen": -92.33528137207031, "logps/rejected": -82.7613754272461, "loss": 1.106, "rewards/accuracies": 0.0, "rewards/chosen": 6.479438781738281, "rewards/margins": -2.0848445892333984, "rewards/rejected": 8.56428337097168, "step": 2090 }, { "epoch": 0.46, "learning_rate": 8.974210644837042e-06, "logits/chosen": -1.5159939527511597, "logits/rejected": -1.5977041721343994, "logps/chosen": -67.10648345947266, "logps/rejected": -148.18209838867188, "loss": 1.0869, "rewards/accuracies": 0.0, "rewards/chosen": 5.511162757873535, "rewards/margins": -1.9292349815368652, "rewards/rejected": 7.4403977394104, "step": 2091 }, { "epoch": 0.46, "learning_rate": 8.973122775677078e-06, "logits/chosen": -1.5662295818328857, "logits/rejected": -1.4926841259002686, "logps/chosen": -87.25363159179688, "logps/rejected": -60.26380920410156, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 4.549859523773193, "rewards/margins": 1.9031767845153809, "rewards/rejected": 2.6466827392578125, "step": 2092 }, { "epoch": 0.46, "learning_rate": 8.97203439598039e-06, "logits/chosen": -1.4594035148620605, "logits/rejected": -1.4449796676635742, "logps/chosen": -42.72058868408203, "logps/rejected": -71.2755126953125, "loss": 0.7879, "rewards/accuracies": 1.0, "rewards/chosen": 4.015774726867676, "rewards/margins": 0.03571343421936035, "rewards/rejected": 3.9800612926483154, "step": 2093 }, { "epoch": 0.46, "learning_rate": 8.970945505886832e-06, "logits/chosen": -1.0844608545303345, "logits/rejected": -1.0006325244903564, "logps/chosen": -35.716373443603516, "logps/rejected": -10.143356323242188, "loss": 0.0855, "rewards/accuracies": 1.0, "rewards/chosen": 3.219965696334839, "rewards/margins": 1.9714194536209106, "rewards/rejected": 1.2485462427139282, "step": 2094 }, { "epoch": 0.46, "learning_rate": 8.96985610553632e-06, "logits/chosen": -1.3170582056045532, "logits/rejected": -1.3872612714767456, "logps/chosen": -108.45633697509766, "logps/rejected": -148.38970947265625, "loss": 3.6977, "rewards/accuracies": 0.0, "rewards/chosen": 3.197164297103882, "rewards/margins": -7.39146614074707, "rewards/rejected": 10.588630676269531, "step": 2095 }, { "epoch": 0.46, "learning_rate": 8.968766195068845e-06, "logits/chosen": -1.4862818717956543, "logits/rejected": -1.495531439781189, "logps/chosen": -139.6852264404297, "logps/rejected": -87.07962036132812, "loss": 1.4485, "rewards/accuracies": 0.0, "rewards/chosen": 5.157895088195801, "rewards/margins": -2.0529723167419434, "rewards/rejected": 7.210867404937744, "step": 2096 }, { "epoch": 0.46, "learning_rate": 8.967675774624451e-06, "logits/chosen": -1.3129538297653198, "logits/rejected": -1.080217719078064, "logps/chosen": -44.30547332763672, "logps/rejected": -72.77142333984375, "loss": 0.3729, "rewards/accuracies": 1.0, "rewards/chosen": 2.7924256324768066, "rewards/margins": 0.5024383068084717, "rewards/rejected": 2.289987325668335, "step": 2097 }, { "epoch": 0.46, "learning_rate": 8.96658484434326e-06, "logits/chosen": -1.459242820739746, "logits/rejected": -1.370400071144104, "logps/chosen": -197.79132080078125, "logps/rejected": -165.11932373046875, "loss": 0.9988, "rewards/accuracies": 0.0, "rewards/chosen": 5.375636577606201, "rewards/margins": -1.8506908416748047, "rewards/rejected": 7.226327419281006, "step": 2098 }, { "epoch": 0.46, "learning_rate": 8.96549340436545e-06, "logits/chosen": -1.34958815574646, "logits/rejected": -1.1383564472198486, "logps/chosen": -53.22142028808594, "logps/rejected": -56.94086837768555, "loss": 0.8292, "rewards/accuracies": 1.0, "rewards/chosen": 2.6518142223358154, "rewards/margins": 1.9111995697021484, "rewards/rejected": 0.7406147122383118, "step": 2099 }, { "epoch": 0.46, "learning_rate": 8.964401454831273e-06, "logits/chosen": -1.2562036514282227, "logits/rejected": -1.1323072910308838, "logps/chosen": -72.51116943359375, "logps/rejected": -60.7830810546875, "loss": 0.4655, "rewards/accuracies": 1.0, "rewards/chosen": 6.185901165008545, "rewards/margins": 4.460357189178467, "rewards/rejected": 1.7255439758300781, "step": 2100 }, { "epoch": 0.47, "learning_rate": 8.963308995881037e-06, "logits/chosen": -1.4135812520980835, "logits/rejected": -1.3974337577819824, "logps/chosen": -68.16355895996094, "logps/rejected": -53.33987045288086, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 4.678001403808594, "rewards/margins": 1.166247844696045, "rewards/rejected": 3.511753559112549, "step": 2101 }, { "epoch": 0.47, "learning_rate": 8.962216027655123e-06, "logits/chosen": -1.3212897777557373, "logits/rejected": -1.3212897777557373, "logps/chosen": -17.515195846557617, "logps/rejected": -17.515195846557617, "loss": 0.5617, "rewards/accuracies": 0.0, "rewards/chosen": 2.6449060440063477, "rewards/margins": 0.0, "rewards/rejected": 2.6449060440063477, "step": 2102 }, { "epoch": 0.47, "learning_rate": 8.961122550293975e-06, "logits/chosen": -1.3869483470916748, "logits/rejected": -1.4326421022415161, "logps/chosen": -75.10790252685547, "logps/rejected": -78.41377258300781, "loss": 0.3981, "rewards/accuracies": 1.0, "rewards/chosen": 6.452926158905029, "rewards/margins": 2.2210464477539062, "rewards/rejected": 4.231879711151123, "step": 2103 }, { "epoch": 0.47, "learning_rate": 8.960028563938101e-06, "logits/chosen": -1.5750248432159424, "logits/rejected": -1.5284082889556885, "logps/chosen": -62.087677001953125, "logps/rejected": -46.413917541503906, "loss": 0.0929, "rewards/accuracies": 1.0, "rewards/chosen": 5.562127113342285, "rewards/margins": 2.958348274230957, "rewards/rejected": 2.603778839111328, "step": 2104 }, { "epoch": 0.47, "learning_rate": 8.958934068728078e-06, "logits/chosen": -1.5560435056686401, "logits/rejected": -1.430803894996643, "logps/chosen": -65.33430480957031, "logps/rejected": -15.644351959228516, "loss": 0.7511, "rewards/accuracies": 0.0, "rewards/chosen": 1.2314033508300781, "rewards/margins": -1.1673226356506348, "rewards/rejected": 2.398725986480713, "step": 2105 }, { "epoch": 0.47, "learning_rate": 8.957839064804542e-06, "logits/chosen": -1.2938950061798096, "logits/rejected": -1.2232717275619507, "logps/chosen": -41.543296813964844, "logps/rejected": -60.90016174316406, "loss": 0.5215, "rewards/accuracies": 0.0, "rewards/chosen": 2.4790284633636475, "rewards/margins": -0.23154067993164062, "rewards/rejected": 2.710569143295288, "step": 2106 }, { "epoch": 0.47, "learning_rate": 8.9567435523082e-06, "logits/chosen": -1.7081060409545898, "logits/rejected": -1.7224395275115967, "logps/chosen": -111.44813537597656, "logps/rejected": -85.32723236083984, "loss": 0.7224, "rewards/accuracies": 0.0, "rewards/chosen": 6.1543121337890625, "rewards/margins": -1.168555736541748, "rewards/rejected": 7.3228678703308105, "step": 2107 }, { "epoch": 0.47, "learning_rate": 8.955647531379826e-06, "logits/chosen": -1.6122303009033203, "logits/rejected": -1.421911358833313, "logps/chosen": -138.27926635742188, "logps/rejected": -26.643978118896484, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 6.41249418258667, "rewards/margins": 5.713319301605225, "rewards/rejected": 0.6991748809814453, "step": 2108 }, { "epoch": 0.47, "learning_rate": 8.954551002160252e-06, "logits/chosen": -1.6933211088180542, "logits/rejected": -1.6457463502883911, "logps/chosen": -73.13720703125, "logps/rejected": -34.190147399902344, "loss": 0.4973, "rewards/accuracies": 1.0, "rewards/chosen": 3.9748122692108154, "rewards/margins": 0.9744787216186523, "rewards/rejected": 3.000333547592163, "step": 2109 }, { "epoch": 0.47, "learning_rate": 8.95345396479038e-06, "logits/chosen": -1.6500219106674194, "logits/rejected": -1.6903237104415894, "logps/chosen": -65.87101745605469, "logps/rejected": -80.6676025390625, "loss": 1.9045, "rewards/accuracies": 0.0, "rewards/chosen": 2.811619520187378, "rewards/margins": -2.7532260417938232, "rewards/rejected": 5.564845561981201, "step": 2110 }, { "epoch": 0.47, "learning_rate": 8.952356419411177e-06, "logits/chosen": -1.6336655616760254, "logits/rejected": -1.624906301498413, "logps/chosen": -98.72352600097656, "logps/rejected": -75.05461120605469, "loss": 2.0157, "rewards/accuracies": 0.0, "rewards/chosen": 5.814245700836182, "rewards/margins": -0.19786357879638672, "rewards/rejected": 6.012109279632568, "step": 2111 }, { "epoch": 0.47, "learning_rate": 8.951258366163677e-06, "logits/chosen": -1.6371124982833862, "logits/rejected": -1.5256366729736328, "logps/chosen": -93.98153686523438, "logps/rejected": -85.54981994628906, "loss": 0.8856, "rewards/accuracies": 0.0, "rewards/chosen": 3.5734755992889404, "rewards/margins": -0.6621768474578857, "rewards/rejected": 4.235652446746826, "step": 2112 }, { "epoch": 0.47, "learning_rate": 8.950159805188973e-06, "logits/chosen": -1.389587640762329, "logits/rejected": -1.456666350364685, "logps/chosen": -33.84721374511719, "logps/rejected": -77.21501159667969, "loss": 1.4677, "rewards/accuracies": 0.0, "rewards/chosen": 3.665762424468994, "rewards/margins": -2.130021572113037, "rewards/rejected": 5.795783996582031, "step": 2113 }, { "epoch": 0.47, "learning_rate": 8.949060736628233e-06, "logits/chosen": -1.7540502548217773, "logits/rejected": -1.6619454622268677, "logps/chosen": -127.5364990234375, "logps/rejected": -47.67314529418945, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 6.063879489898682, "rewards/margins": 4.924230098724365, "rewards/rejected": 1.1396492719650269, "step": 2114 }, { "epoch": 0.47, "learning_rate": 8.94796116062268e-06, "logits/chosen": -1.6919605731964111, "logits/rejected": -1.6919605731964111, "logps/chosen": -29.762165069580078, "logps/rejected": -29.762165069580078, "loss": 2.1001, "rewards/accuracies": 0.0, "rewards/chosen": 1.513162612915039, "rewards/margins": 0.0, "rewards/rejected": 1.513162612915039, "step": 2115 }, { "epoch": 0.47, "learning_rate": 8.946861077313609e-06, "logits/chosen": -1.3720542192459106, "logits/rejected": -1.294965386390686, "logps/chosen": -73.88597106933594, "logps/rejected": -94.53086853027344, "loss": 2.5092, "rewards/accuracies": 0.0, "rewards/chosen": 2.602987766265869, "rewards/margins": -4.913548469543457, "rewards/rejected": 7.516536235809326, "step": 2116 }, { "epoch": 0.47, "learning_rate": 8.945760486842377e-06, "logits/chosen": -1.253852367401123, "logits/rejected": -1.1086628437042236, "logps/chosen": -60.03254699707031, "logps/rejected": -10.483508110046387, "loss": 2.6475, "rewards/accuracies": 1.0, "rewards/chosen": 6.394967555999756, "rewards/margins": 5.692267417907715, "rewards/rejected": 0.7026999592781067, "step": 2117 }, { "epoch": 0.47, "learning_rate": 8.944659389350409e-06, "logits/chosen": -1.4289891719818115, "logits/rejected": -1.4772698879241943, "logps/chosen": -121.02156829833984, "logps/rejected": -119.4166259765625, "loss": 1.0995, "rewards/accuracies": 0.0, "rewards/chosen": 5.665203094482422, "rewards/margins": -1.9068446159362793, "rewards/rejected": 7.572047710418701, "step": 2118 }, { "epoch": 0.47, "learning_rate": 8.94355778497919e-06, "logits/chosen": -1.3136364221572876, "logits/rejected": -1.0526301860809326, "logps/chosen": -111.76376342773438, "logps/rejected": -36.2327880859375, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 4.986569404602051, "rewards/margins": 3.942723274230957, "rewards/rejected": 1.0438461303710938, "step": 2119 }, { "epoch": 0.47, "learning_rate": 8.942455673870278e-06, "logits/chosen": -1.2603293657302856, "logits/rejected": -1.3111608028411865, "logps/chosen": -67.06156921386719, "logps/rejected": -84.60615539550781, "loss": 2.7296, "rewards/accuracies": 0.0, "rewards/chosen": 2.635873556137085, "rewards/margins": -5.37928581237793, "rewards/rejected": 8.015159606933594, "step": 2120 }, { "epoch": 0.47, "learning_rate": 8.941353056165288e-06, "logits/chosen": -1.2026233673095703, "logits/rejected": -1.2263435125350952, "logps/chosen": -39.081085205078125, "logps/rejected": -49.4810905456543, "loss": 2.6684, "rewards/accuracies": 0.0, "rewards/chosen": 1.383673906326294, "rewards/margins": -1.7852520942687988, "rewards/rejected": 3.1689260005950928, "step": 2121 }, { "epoch": 0.47, "learning_rate": 8.940249932005904e-06, "logits/chosen": -1.3046066761016846, "logits/rejected": -1.4339361190795898, "logps/chosen": -53.7240104675293, "logps/rejected": -100.78302001953125, "loss": 3.9576, "rewards/accuracies": 0.0, "rewards/chosen": 2.719588041305542, "rewards/margins": -4.296119689941406, "rewards/rejected": 7.015707492828369, "step": 2122 }, { "epoch": 0.47, "learning_rate": 8.939146301533878e-06, "logits/chosen": -1.4112401008605957, "logits/rejected": -1.3057804107666016, "logps/chosen": -43.27350616455078, "logps/rejected": -52.99061584472656, "loss": 1.1782, "rewards/accuracies": 0.0, "rewards/chosen": 2.424192190170288, "rewards/margins": -0.8763525485992432, "rewards/rejected": 3.3005447387695312, "step": 2123 }, { "epoch": 0.47, "learning_rate": 8.938042164891021e-06, "logits/chosen": -1.4613419771194458, "logits/rejected": -1.3311809301376343, "logps/chosen": -100.46324920654297, "logps/rejected": -25.96700096130371, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 6.1476969718933105, "rewards/margins": 4.848606109619141, "rewards/rejected": 1.2990907430648804, "step": 2124 }, { "epoch": 0.47, "learning_rate": 8.936937522219212e-06, "logits/chosen": -1.3830320835113525, "logits/rejected": -1.3178114891052246, "logps/chosen": -97.81997680664062, "logps/rejected": -166.26025390625, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 6.005760192871094, "rewards/margins": 0.2337050437927246, "rewards/rejected": 5.772055149078369, "step": 2125 }, { "epoch": 0.47, "learning_rate": 8.935832373660397e-06, "logits/chosen": -1.517905831336975, "logits/rejected": -1.384196400642395, "logps/chosen": -56.59169387817383, "logps/rejected": -21.665027618408203, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 5.052000045776367, "rewards/margins": 3.1656694412231445, "rewards/rejected": 1.886330485343933, "step": 2126 }, { "epoch": 0.47, "learning_rate": 8.934726719356582e-06, "logits/chosen": -1.4707199335098267, "logits/rejected": -1.4859462976455688, "logps/chosen": -71.39480590820312, "logps/rejected": -58.09990692138672, "loss": 0.6614, "rewards/accuracies": 0.0, "rewards/chosen": 2.973378896713257, "rewards/margins": -0.1771697998046875, "rewards/rejected": 3.1505486965179443, "step": 2127 }, { "epoch": 0.47, "learning_rate": 8.933620559449842e-06, "logits/chosen": -1.344497561454773, "logits/rejected": -1.3616809844970703, "logps/chosen": -43.27301788330078, "logps/rejected": -56.32255172729492, "loss": 0.6049, "rewards/accuracies": 0.0, "rewards/chosen": 2.580596923828125, "rewards/margins": -0.7783801555633545, "rewards/rejected": 3.3589770793914795, "step": 2128 }, { "epoch": 0.47, "learning_rate": 8.932513894082317e-06, "logits/chosen": -1.7243127822875977, "logits/rejected": -1.6254193782806396, "logps/chosen": -54.63761901855469, "logps/rejected": -22.462827682495117, "loss": 0.8151, "rewards/accuracies": 1.0, "rewards/chosen": 1.620629906654358, "rewards/margins": 0.8401204943656921, "rewards/rejected": 0.7805094122886658, "step": 2129 }, { "epoch": 0.47, "learning_rate": 8.93140672339621e-06, "logits/chosen": -1.6784090995788574, "logits/rejected": -1.5956485271453857, "logps/chosen": -132.21578979492188, "logps/rejected": -102.19983673095703, "loss": 0.2084, "rewards/accuracies": 1.0, "rewards/chosen": 8.887591361999512, "rewards/margins": 3.6081900596618652, "rewards/rejected": 5.2794013023376465, "step": 2130 }, { "epoch": 0.47, "learning_rate": 8.930299047533792e-06, "logits/chosen": -1.538896918296814, "logits/rejected": -1.5066814422607422, "logps/chosen": -10.194381713867188, "logps/rejected": -17.080917358398438, "loss": 0.2768, "rewards/accuracies": 1.0, "rewards/chosen": 1.8195213079452515, "rewards/margins": 0.30489540100097656, "rewards/rejected": 1.514625906944275, "step": 2131 }, { "epoch": 0.47, "learning_rate": 8.929190866637391e-06, "logits/chosen": -1.7548389434814453, "logits/rejected": -1.7451924085617065, "logps/chosen": -77.86883544921875, "logps/rejected": -97.36654663085938, "loss": 1.2984, "rewards/accuracies": 0.0, "rewards/chosen": 4.771132946014404, "rewards/margins": -1.3097953796386719, "rewards/rejected": 6.080928325653076, "step": 2132 }, { "epoch": 0.47, "learning_rate": 8.92808218084941e-06, "logits/chosen": -1.445894479751587, "logits/rejected": -1.4553107023239136, "logps/chosen": -115.99662017822266, "logps/rejected": -223.47354125976562, "loss": 1.0453, "rewards/accuracies": 0.0, "rewards/chosen": 6.920730113983154, "rewards/margins": -1.9568095207214355, "rewards/rejected": 8.87753963470459, "step": 2133 }, { "epoch": 0.47, "learning_rate": 8.926972990312314e-06, "logits/chosen": -1.590467095375061, "logits/rejected": -1.5011013746261597, "logps/chosen": -122.0276107788086, "logps/rejected": -48.92165756225586, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 6.691048622131348, "rewards/margins": 4.3216142654418945, "rewards/rejected": 2.369434118270874, "step": 2134 }, { "epoch": 0.47, "learning_rate": 8.925863295168628e-06, "logits/chosen": -1.3589632511138916, "logits/rejected": -1.2361259460449219, "logps/chosen": -80.44100952148438, "logps/rejected": -23.722211837768555, "loss": 0.422, "rewards/accuracies": 1.0, "rewards/chosen": 2.837324619293213, "rewards/margins": 1.8156839609146118, "rewards/rejected": 1.021640658378601, "step": 2135 }, { "epoch": 0.47, "learning_rate": 8.924753095560945e-06, "logits/chosen": -1.5995368957519531, "logits/rejected": -1.5506755113601685, "logps/chosen": -31.75075340270996, "logps/rejected": -46.90910720825195, "loss": 0.5661, "rewards/accuracies": 0.0, "rewards/chosen": 2.274345636367798, "rewards/margins": -0.23887109756469727, "rewards/rejected": 2.513216733932495, "step": 2136 }, { "epoch": 0.47, "learning_rate": 8.923642391631924e-06, "logits/chosen": -1.159131407737732, "logits/rejected": -1.1785534620285034, "logps/chosen": -39.83594512939453, "logps/rejected": -77.91993713378906, "loss": 1.2601, "rewards/accuracies": 0.0, "rewards/chosen": 1.442834496498108, "rewards/margins": -2.401839256286621, "rewards/rejected": 3.8446738719940186, "step": 2137 }, { "epoch": 0.47, "learning_rate": 8.922531183524287e-06, "logits/chosen": -1.7583292722702026, "logits/rejected": -1.7751123905181885, "logps/chosen": -54.4456787109375, "logps/rejected": -39.99881362915039, "loss": 1.0683, "rewards/accuracies": 0.0, "rewards/chosen": 1.725716471672058, "rewards/margins": -1.3552058935165405, "rewards/rejected": 3.0809223651885986, "step": 2138 }, { "epoch": 0.47, "learning_rate": 8.921419471380826e-06, "logits/chosen": -1.495187759399414, "logits/rejected": -1.2736626863479614, "logps/chosen": -81.46733093261719, "logps/rejected": -199.0728759765625, "loss": 2.5311, "rewards/accuracies": 0.0, "rewards/chosen": 4.012170314788818, "rewards/margins": -5.050473690032959, "rewards/rejected": 9.062644004821777, "step": 2139 }, { "epoch": 0.47, "learning_rate": 8.920307255344386e-06, "logits/chosen": -1.2930184602737427, "logits/rejected": -1.268240213394165, "logps/chosen": -20.92515754699707, "logps/rejected": -74.71858215332031, "loss": 0.4069, "rewards/accuracies": 0.0, "rewards/chosen": 1.4309790134429932, "rewards/margins": -0.22069180011749268, "rewards/rejected": 1.6516708135604858, "step": 2140 }, { "epoch": 0.47, "learning_rate": 8.91919453555789e-06, "logits/chosen": -1.502298355102539, "logits/rejected": -1.507869839668274, "logps/chosen": -38.2952880859375, "logps/rejected": -43.29053497314453, "loss": 0.2779, "rewards/accuracies": 1.0, "rewards/chosen": 2.6364288330078125, "rewards/margins": 0.3588430881500244, "rewards/rejected": 2.277585744857788, "step": 2141 }, { "epoch": 0.47, "learning_rate": 8.918081312164318e-06, "logits/chosen": -1.345397710800171, "logits/rejected": -1.3122867345809937, "logps/chosen": -68.19692993164062, "logps/rejected": -70.70445251464844, "loss": 2.4112, "rewards/accuracies": 0.0, "rewards/chosen": 3.1056289672851562, "rewards/margins": -0.8828713893890381, "rewards/rejected": 3.9885003566741943, "step": 2142 }, { "epoch": 0.47, "learning_rate": 8.916967585306715e-06, "logits/chosen": -1.8117637634277344, "logits/rejected": -1.7969563007354736, "logps/chosen": -65.491943359375, "logps/rejected": -40.450401306152344, "loss": 1.3028, "rewards/accuracies": 0.0, "rewards/chosen": 2.5640275478363037, "rewards/margins": -0.9436728954315186, "rewards/rejected": 3.5077004432678223, "step": 2143 }, { "epoch": 0.47, "learning_rate": 8.915853355128192e-06, "logits/chosen": -1.4629945755004883, "logits/rejected": -1.3320130109786987, "logps/chosen": -48.195552825927734, "logps/rejected": -38.570133209228516, "loss": 0.1974, "rewards/accuracies": 1.0, "rewards/chosen": 2.273247241973877, "rewards/margins": 1.001305103302002, "rewards/rejected": 1.271942138671875, "step": 2144 }, { "epoch": 0.47, "learning_rate": 8.91473862177193e-06, "logits/chosen": -1.5346708297729492, "logits/rejected": -1.5975946187973022, "logps/chosen": -85.20022583007812, "logps/rejected": -74.3702163696289, "loss": 0.4695, "rewards/accuracies": 0.0, "rewards/chosen": 5.331428527832031, "rewards/margins": -0.07767438888549805, "rewards/rejected": 5.409102916717529, "step": 2145 }, { "epoch": 0.47, "learning_rate": 8.913623385381163e-06, "logits/chosen": -1.1425799131393433, "logits/rejected": -0.9561513066291809, "logps/chosen": -95.53288269042969, "logps/rejected": -56.866554260253906, "loss": 0.5432, "rewards/accuracies": 1.0, "rewards/chosen": 5.293684482574463, "rewards/margins": 3.5204765796661377, "rewards/rejected": 1.7732079029083252, "step": 2146 }, { "epoch": 0.48, "learning_rate": 8.9125076460992e-06, "logits/chosen": -1.4013434648513794, "logits/rejected": -1.4290859699249268, "logps/chosen": -51.481231689453125, "logps/rejected": -44.72638702392578, "loss": 1.4562, "rewards/accuracies": 0.0, "rewards/chosen": 2.318878173828125, "rewards/margins": -2.425520420074463, "rewards/rejected": 4.744398593902588, "step": 2147 }, { "epoch": 0.48, "learning_rate": 8.91139140406941e-06, "logits/chosen": -1.5664889812469482, "logits/rejected": -1.3850609064102173, "logps/chosen": -115.39995574951172, "logps/rejected": -95.01654052734375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 8.929784774780273, "rewards/margins": 5.063024520874023, "rewards/rejected": 3.86676025390625, "step": 2148 }, { "epoch": 0.48, "learning_rate": 8.910274659435226e-06, "logits/chosen": -1.6101421117782593, "logits/rejected": -1.5890454053878784, "logps/chosen": -40.219940185546875, "logps/rejected": -107.22663879394531, "loss": 0.8389, "rewards/accuracies": 0.0, "rewards/chosen": 2.374987840652466, "rewards/margins": -1.1797332763671875, "rewards/rejected": 3.5547211170196533, "step": 2149 }, { "epoch": 0.48, "learning_rate": 8.90915741234015e-06, "logits/chosen": -1.465417742729187, "logits/rejected": -1.3715205192565918, "logps/chosen": -82.15044403076172, "logps/rejected": -67.07736206054688, "loss": 0.2011, "rewards/accuracies": 1.0, "rewards/chosen": 2.541921377182007, "rewards/margins": 0.8284852504730225, "rewards/rejected": 1.7134361267089844, "step": 2150 }, { "epoch": 0.48, "learning_rate": 8.908039662927743e-06, "logits/chosen": -1.8321828842163086, "logits/rejected": -1.756115436553955, "logps/chosen": -94.8355712890625, "logps/rejected": -59.05923843383789, "loss": 0.4199, "rewards/accuracies": 1.0, "rewards/chosen": 6.962791442871094, "rewards/margins": 2.5410876274108887, "rewards/rejected": 4.421703815460205, "step": 2151 }, { "epoch": 0.48, "learning_rate": 8.906921411341634e-06, "logits/chosen": -1.3920104503631592, "logits/rejected": -1.4086897373199463, "logps/chosen": -57.085819244384766, "logps/rejected": -45.022705078125, "loss": 0.9359, "rewards/accuracies": 0.0, "rewards/chosen": 3.0159451961517334, "rewards/margins": -0.5262117385864258, "rewards/rejected": 3.542156934738159, "step": 2152 }, { "epoch": 0.48, "learning_rate": 8.905802657725516e-06, "logits/chosen": -1.5156164169311523, "logits/rejected": -1.4441661834716797, "logps/chosen": -115.81922149658203, "logps/rejected": -89.4717788696289, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 6.472048282623291, "rewards/margins": 3.711442708969116, "rewards/rejected": 2.760605573654175, "step": 2153 }, { "epoch": 0.48, "learning_rate": 8.904683402223146e-06, "logits/chosen": -1.504723072052002, "logits/rejected": -1.3491953611373901, "logps/chosen": -142.39984130859375, "logps/rejected": -40.33984375, "loss": 0.2689, "rewards/accuracies": 1.0, "rewards/chosen": 4.53582763671875, "rewards/margins": 3.680086851119995, "rewards/rejected": 0.8557407259941101, "step": 2154 }, { "epoch": 0.48, "learning_rate": 8.903563644978346e-06, "logits/chosen": -1.4874744415283203, "logits/rejected": -1.3712007999420166, "logps/chosen": -87.02494812011719, "logps/rejected": -35.35917663574219, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 7.20456075668335, "rewards/margins": 5.22037935256958, "rewards/rejected": 1.98418128490448, "step": 2155 }, { "epoch": 0.48, "learning_rate": 8.902443386135e-06, "logits/chosen": -1.5473113059997559, "logits/rejected": -1.521382451057434, "logps/chosen": -119.88076782226562, "logps/rejected": -130.31781005859375, "loss": 0.8033, "rewards/accuracies": 0.0, "rewards/chosen": 6.0100297927856445, "rewards/margins": -0.5271286964416504, "rewards/rejected": 6.537158489227295, "step": 2156 }, { "epoch": 0.48, "learning_rate": 8.90132262583706e-06, "logits/chosen": -1.2672977447509766, "logits/rejected": -1.2487053871154785, "logps/chosen": -29.965351104736328, "logps/rejected": -29.033252716064453, "loss": 0.3201, "rewards/accuracies": 1.0, "rewards/chosen": 1.5217045545578003, "rewards/margins": 0.6554409861564636, "rewards/rejected": 0.8662635684013367, "step": 2157 }, { "epoch": 0.48, "learning_rate": 8.900201364228542e-06, "logits/chosen": -1.6628774404525757, "logits/rejected": -1.7034825086593628, "logps/chosen": -105.82310485839844, "logps/rejected": -145.0869903564453, "loss": 1.1505, "rewards/accuracies": 0.0, "rewards/chosen": 7.1577606201171875, "rewards/margins": -2.068699836730957, "rewards/rejected": 9.226460456848145, "step": 2158 }, { "epoch": 0.48, "learning_rate": 8.899079601453524e-06, "logits/chosen": -1.2952992916107178, "logits/rejected": -1.255599021911621, "logps/chosen": -62.01496887207031, "logps/rejected": -40.49648666381836, "loss": 0.2037, "rewards/accuracies": 1.0, "rewards/chosen": 2.523703098297119, "rewards/margins": 0.6903507709503174, "rewards/rejected": 1.8333523273468018, "step": 2159 }, { "epoch": 0.48, "learning_rate": 8.897957337656151e-06, "logits/chosen": -1.5047099590301514, "logits/rejected": -1.5230404138565063, "logps/chosen": -23.592681884765625, "logps/rejected": -49.48622512817383, "loss": 0.2677, "rewards/accuracies": 1.0, "rewards/chosen": 3.5278680324554443, "rewards/margins": 0.3546407222747803, "rewards/rejected": 3.173227310180664, "step": 2160 }, { "epoch": 0.48, "learning_rate": 8.89683457298063e-06, "logits/chosen": -1.5414535999298096, "logits/rejected": -1.5414535999298096, "logps/chosen": -67.15576934814453, "logps/rejected": -67.15576934814453, "loss": 0.5284, "rewards/accuracies": 0.0, "rewards/chosen": 3.1219239234924316, "rewards/margins": 0.0, "rewards/rejected": 3.1219239234924316, "step": 2161 }, { "epoch": 0.48, "learning_rate": 8.895711307571235e-06, "logits/chosen": -1.8324382305145264, "logits/rejected": -1.8324382305145264, "logps/chosen": -63.6988525390625, "logps/rejected": -63.6988525390625, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 5.093470096588135, "rewards/margins": 0.0, "rewards/rejected": 5.093470096588135, "step": 2162 }, { "epoch": 0.48, "learning_rate": 8.894587541572301e-06, "logits/chosen": -1.3826428651809692, "logits/rejected": -1.3795756101608276, "logps/chosen": -49.11212158203125, "logps/rejected": -96.63798522949219, "loss": 1.1461, "rewards/accuracies": 0.0, "rewards/chosen": 2.400775194168091, "rewards/margins": -1.373277187347412, "rewards/rejected": 3.774052381515503, "step": 2163 }, { "epoch": 0.48, "learning_rate": 8.89346327512823e-06, "logits/chosen": -1.5976771116256714, "logits/rejected": -1.4811043739318848, "logps/chosen": -128.5409698486328, "logps/rejected": -112.03510284423828, "loss": 0.1921, "rewards/accuracies": 1.0, "rewards/chosen": 4.341296672821045, "rewards/margins": 0.7935342788696289, "rewards/rejected": 3.547762393951416, "step": 2164 }, { "epoch": 0.48, "learning_rate": 8.89233850838349e-06, "logits/chosen": -1.4840327501296997, "logits/rejected": -1.3989083766937256, "logps/chosen": -65.30201721191406, "logps/rejected": -45.684383392333984, "loss": 0.7559, "rewards/accuracies": 0.0, "rewards/chosen": 2.1455674171447754, "rewards/margins": -0.8144733905792236, "rewards/rejected": 2.960040807723999, "step": 2165 }, { "epoch": 0.48, "learning_rate": 8.891213241482606e-06, "logits/chosen": -1.6308671236038208, "logits/rejected": -1.6378742456436157, "logps/chosen": -63.025856018066406, "logps/rejected": -67.81353759765625, "loss": 0.3269, "rewards/accuracies": 1.0, "rewards/chosen": 3.4819953441619873, "rewards/margins": 0.13304519653320312, "rewards/rejected": 3.348950147628784, "step": 2166 }, { "epoch": 0.48, "learning_rate": 8.890087474570174e-06, "logits/chosen": -1.6835367679595947, "logits/rejected": -1.6827232837677002, "logps/chosen": -49.363380432128906, "logps/rejected": -70.84098052978516, "loss": 1.7637, "rewards/accuracies": 0.0, "rewards/chosen": 1.7092392444610596, "rewards/margins": -3.463308095932007, "rewards/rejected": 5.172547340393066, "step": 2167 }, { "epoch": 0.48, "learning_rate": 8.888961207790856e-06, "logits/chosen": -1.45384681224823, "logits/rejected": -1.4410715103149414, "logps/chosen": -48.45159912109375, "logps/rejected": -49.26361083984375, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 5.3231201171875, "rewards/margins": 2.3689329624176025, "rewards/rejected": 2.9541871547698975, "step": 2168 }, { "epoch": 0.48, "learning_rate": 8.887834441289369e-06, "logits/chosen": -1.3603795766830444, "logits/rejected": -1.1897729635238647, "logps/chosen": -48.69259262084961, "logps/rejected": -68.40652465820312, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 2.629993200302124, "rewards/margins": 3.3429524898529053, "rewards/rejected": -0.7129592895507812, "step": 2169 }, { "epoch": 0.48, "learning_rate": 8.886707175210503e-06, "logits/chosen": -1.3325302600860596, "logits/rejected": -1.2653838396072388, "logps/chosen": -47.27534103393555, "logps/rejected": -11.356432914733887, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 4.6230244636535645, "rewards/margins": 4.141806602478027, "rewards/rejected": 0.4812180697917938, "step": 2170 }, { "epoch": 0.48, "learning_rate": 8.88557940969911e-06, "logits/chosen": -1.6077972650527954, "logits/rejected": -1.4506258964538574, "logps/chosen": -59.694271087646484, "logps/rejected": -40.14674758911133, "loss": 0.5613, "rewards/accuracies": 1.0, "rewards/chosen": 2.1244161128997803, "rewards/margins": 1.684867262840271, "rewards/rejected": 0.43954887986183167, "step": 2171 }, { "epoch": 0.48, "learning_rate": 8.884451144900104e-06, "logits/chosen": -1.3577792644500732, "logits/rejected": -1.3654401302337646, "logps/chosen": -61.735862731933594, "logps/rejected": -35.05879592895508, "loss": 0.5634, "rewards/accuracies": 0.0, "rewards/chosen": 1.7198814153671265, "rewards/margins": -0.5428050756454468, "rewards/rejected": 2.2626864910125732, "step": 2172 }, { "epoch": 0.48, "learning_rate": 8.88332238095846e-06, "logits/chosen": -1.4065109491348267, "logits/rejected": -1.2920230627059937, "logps/chosen": -59.36878204345703, "logps/rejected": -59.56403350830078, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": 4.283900737762451, "rewards/margins": 0.8967082500457764, "rewards/rejected": 3.387192487716675, "step": 2173 }, { "epoch": 0.48, "learning_rate": 8.882193118019229e-06, "logits/chosen": -1.197271466255188, "logits/rejected": -1.166878581047058, "logps/chosen": -69.83695220947266, "logps/rejected": -75.66696166992188, "loss": 0.7956, "rewards/accuracies": 0.0, "rewards/chosen": 3.3479440212249756, "rewards/margins": -0.20255112648010254, "rewards/rejected": 3.550495147705078, "step": 2174 }, { "epoch": 0.48, "learning_rate": 8.881063356227513e-06, "logits/chosen": -1.3781737089157104, "logits/rejected": -1.3554221391677856, "logps/chosen": -77.84345245361328, "logps/rejected": -36.52757263183594, "loss": 0.4708, "rewards/accuracies": 1.0, "rewards/chosen": 3.4058997631073, "rewards/margins": 1.215095043182373, "rewards/rejected": 2.1908047199249268, "step": 2175 }, { "epoch": 0.48, "learning_rate": 8.879933095728485e-06, "logits/chosen": -1.5348291397094727, "logits/rejected": -1.4569798707962036, "logps/chosen": -107.3177490234375, "logps/rejected": -22.497764587402344, "loss": 1.0733, "rewards/accuracies": 1.0, "rewards/chosen": 5.905602931976318, "rewards/margins": 5.0930047035217285, "rewards/rejected": 0.8125980496406555, "step": 2176 }, { "epoch": 0.48, "learning_rate": 8.878802336667384e-06, "logits/chosen": -1.7625072002410889, "logits/rejected": -1.7778843641281128, "logps/chosen": -50.76654815673828, "logps/rejected": -102.33970642089844, "loss": 2.474, "rewards/accuracies": 0.0, "rewards/chosen": 3.036461591720581, "rewards/margins": -1.936382532119751, "rewards/rejected": 4.972844123840332, "step": 2177 }, { "epoch": 0.48, "learning_rate": 8.877671079189505e-06, "logits/chosen": -1.9190198183059692, "logits/rejected": -1.8231316804885864, "logps/chosen": -166.65142822265625, "logps/rejected": -55.15204620361328, "loss": 0.2934, "rewards/accuracies": 1.0, "rewards/chosen": 5.958957195281982, "rewards/margins": 0.2505350112915039, "rewards/rejected": 5.7084221839904785, "step": 2178 }, { "epoch": 0.48, "learning_rate": 8.876539323440214e-06, "logits/chosen": -1.3140126466751099, "logits/rejected": -1.3006583452224731, "logps/chosen": -28.118255615234375, "logps/rejected": -76.40299987792969, "loss": 1.4483, "rewards/accuracies": 0.0, "rewards/chosen": 2.340506076812744, "rewards/margins": -2.8089003562927246, "rewards/rejected": 5.149406433105469, "step": 2179 }, { "epoch": 0.48, "learning_rate": 8.87540706956494e-06, "logits/chosen": -1.3334230184555054, "logits/rejected": -1.2719396352767944, "logps/chosen": -50.62617492675781, "logps/rejected": -26.116531372070312, "loss": 0.5014, "rewards/accuracies": 1.0, "rewards/chosen": 5.391324043273926, "rewards/margins": 3.0445005893707275, "rewards/rejected": 2.3468234539031982, "step": 2180 }, { "epoch": 0.48, "learning_rate": 8.874274317709173e-06, "logits/chosen": -1.3310084342956543, "logits/rejected": -1.3716821670532227, "logps/chosen": -44.01082992553711, "logps/rejected": -100.88996124267578, "loss": 1.9913, "rewards/accuracies": 0.0, "rewards/chosen": 2.628807544708252, "rewards/margins": -3.367097854614258, "rewards/rejected": 5.99590539932251, "step": 2181 }, { "epoch": 0.48, "learning_rate": 8.873141068018469e-06, "logits/chosen": -1.5277920961380005, "logits/rejected": -1.5256201028823853, "logps/chosen": -61.90214538574219, "logps/rejected": -64.95631408691406, "loss": 0.9483, "rewards/accuracies": 1.0, "rewards/chosen": 2.2798354625701904, "rewards/margins": 0.1518707275390625, "rewards/rejected": 2.127964735031128, "step": 2182 }, { "epoch": 0.48, "learning_rate": 8.872007320638449e-06, "logits/chosen": -1.4336521625518799, "logits/rejected": -1.3750219345092773, "logps/chosen": -33.956398010253906, "logps/rejected": -13.907346725463867, "loss": 0.4135, "rewards/accuracies": 0.0, "rewards/chosen": 0.7444305419921875, "rewards/margins": -0.2381097674369812, "rewards/rejected": 0.9825403094291687, "step": 2183 }, { "epoch": 0.48, "learning_rate": 8.870873075714797e-06, "logits/chosen": -1.5999081134796143, "logits/rejected": -1.5999081134796143, "logps/chosen": -42.08331298828125, "logps/rejected": -42.08331298828125, "loss": 0.4998, "rewards/accuracies": 0.0, "rewards/chosen": 3.066668748855591, "rewards/margins": 0.0, "rewards/rejected": 3.066668748855591, "step": 2184 }, { "epoch": 0.48, "learning_rate": 8.86973833339326e-06, "logits/chosen": -1.6315187215805054, "logits/rejected": -1.6416337490081787, "logps/chosen": -43.105594635009766, "logps/rejected": -70.12682342529297, "loss": 0.7403, "rewards/accuracies": 0.0, "rewards/chosen": 2.2586522102355957, "rewards/margins": -0.18042492866516113, "rewards/rejected": 2.439077138900757, "step": 2185 }, { "epoch": 0.48, "learning_rate": 8.86860309381965e-06, "logits/chosen": -1.3052301406860352, "logits/rejected": -1.3052301406860352, "logps/chosen": -41.35494613647461, "logps/rejected": -41.35494613647461, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.7642078399658203, "rewards/margins": 0.0, "rewards/rejected": 3.7642078399658203, "step": 2186 }, { "epoch": 0.48, "learning_rate": 8.867467357139842e-06, "logits/chosen": -1.461767554283142, "logits/rejected": -1.4016402959823608, "logps/chosen": -87.69595336914062, "logps/rejected": -145.75778198242188, "loss": 0.1655, "rewards/accuracies": 1.0, "rewards/chosen": 7.565005779266357, "rewards/margins": 5.060294151306152, "rewards/rejected": 2.504711866378784, "step": 2187 }, { "epoch": 0.48, "learning_rate": 8.866331123499775e-06, "logits/chosen": -1.271718144416809, "logits/rejected": -1.196192741394043, "logps/chosen": -91.15963745117188, "logps/rejected": -47.547950744628906, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 5.12123441696167, "rewards/margins": 3.094205617904663, "rewards/rejected": 2.027028799057007, "step": 2188 }, { "epoch": 0.48, "learning_rate": 8.865194393045452e-06, "logits/chosen": -1.5190346240997314, "logits/rejected": -1.572365641593933, "logps/chosen": -33.296573638916016, "logps/rejected": -134.8706817626953, "loss": 1.6713, "rewards/accuracies": 0.0, "rewards/chosen": 2.299165725708008, "rewards/margins": -3.290849208831787, "rewards/rejected": 5.590014934539795, "step": 2189 }, { "epoch": 0.48, "learning_rate": 8.864057165922944e-06, "logits/chosen": -1.3672329187393188, "logits/rejected": -1.2068818807601929, "logps/chosen": -38.45458221435547, "logps/rejected": -19.28037452697754, "loss": 1.8623, "rewards/accuracies": 1.0, "rewards/chosen": 4.6112895011901855, "rewards/margins": 4.011529445648193, "rewards/rejected": 0.5997602343559265, "step": 2190 }, { "epoch": 0.48, "learning_rate": 8.862919442278379e-06, "logits/chosen": -1.4920824766159058, "logits/rejected": -1.4920824766159058, "logps/chosen": -62.19920349121094, "logps/rejected": -62.19920349121094, "loss": 0.3628, "rewards/accuracies": 0.0, "rewards/chosen": 4.06411600112915, "rewards/margins": 0.0, "rewards/rejected": 4.06411600112915, "step": 2191 }, { "epoch": 0.49, "learning_rate": 8.86178122225795e-06, "logits/chosen": -1.5798102617263794, "logits/rejected": -1.5305677652359009, "logps/chosen": -57.84489059448242, "logps/rejected": -25.237703323364258, "loss": 4.0355, "rewards/accuracies": 0.0, "rewards/chosen": 1.23334538936615, "rewards/margins": -1.1619104146957397, "rewards/rejected": 2.3952558040618896, "step": 2192 }, { "epoch": 0.49, "learning_rate": 8.860642506007919e-06, "logits/chosen": -1.2943284511566162, "logits/rejected": -1.1663471460342407, "logps/chosen": -55.06110382080078, "logps/rejected": -9.681551933288574, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": 3.6940505504608154, "rewards/margins": 2.274231195449829, "rewards/rejected": 1.4198193550109863, "step": 2193 }, { "epoch": 0.49, "learning_rate": 8.859503293674605e-06, "logits/chosen": -1.4579628705978394, "logits/rejected": -1.3756502866744995, "logps/chosen": -45.864784240722656, "logps/rejected": -60.420616149902344, "loss": 0.5443, "rewards/accuracies": 1.0, "rewards/chosen": 2.361445665359497, "rewards/margins": 0.16411352157592773, "rewards/rejected": 2.1973321437835693, "step": 2194 }, { "epoch": 0.49, "learning_rate": 8.858363585404397e-06, "logits/chosen": -1.4175446033477783, "logits/rejected": -1.3710986375808716, "logps/chosen": -43.02475357055664, "logps/rejected": -36.37818145751953, "loss": 0.6481, "rewards/accuracies": 0.0, "rewards/chosen": 2.308558225631714, "rewards/margins": -0.926377534866333, "rewards/rejected": 3.234935760498047, "step": 2195 }, { "epoch": 0.49, "learning_rate": 8.857223381343742e-06, "logits/chosen": -1.3558744192123413, "logits/rejected": -1.4818071126937866, "logps/chosen": -97.00747680664062, "logps/rejected": -75.11835479736328, "loss": 2.3191, "rewards/accuracies": 0.0, "rewards/chosen": 3.1957626342773438, "rewards/margins": -4.6222758293151855, "rewards/rejected": 7.818038463592529, "step": 2196 }, { "epoch": 0.49, "learning_rate": 8.856082681639158e-06, "logits/chosen": -1.538662314414978, "logits/rejected": -1.3640834093093872, "logps/chosen": -62.273338317871094, "logps/rejected": -41.20037841796875, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 2.4372498989105225, "rewards/margins": 1.5447168350219727, "rewards/rejected": 0.8925331234931946, "step": 2197 }, { "epoch": 0.49, "learning_rate": 8.854941486437216e-06, "logits/chosen": -1.445303201675415, "logits/rejected": -1.3449251651763916, "logps/chosen": -43.925384521484375, "logps/rejected": -80.6171646118164, "loss": 1.606, "rewards/accuracies": 0.0, "rewards/chosen": 2.963191270828247, "rewards/margins": -3.1678354740142822, "rewards/rejected": 6.131026744842529, "step": 2198 }, { "epoch": 0.49, "learning_rate": 8.853799795884562e-06, "logits/chosen": -1.1102697849273682, "logits/rejected": -1.1102697849273682, "logps/chosen": -57.61277770996094, "logps/rejected": -57.61277770996094, "loss": 1.2954, "rewards/accuracies": 0.0, "rewards/chosen": 2.519030809402466, "rewards/margins": 0.0, "rewards/rejected": 2.519030809402466, "step": 2199 }, { "epoch": 0.49, "learning_rate": 8.852657610127898e-06, "logits/chosen": -1.5437448024749756, "logits/rejected": -1.6075047254562378, "logps/chosen": -185.29019165039062, "logps/rejected": -89.41812133789062, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": 9.15714168548584, "rewards/margins": 1.4151248931884766, "rewards/rejected": 7.742016792297363, "step": 2200 }, { "epoch": 0.49, "learning_rate": 8.851514929313992e-06, "logits/chosen": -1.9844471216201782, "logits/rejected": -1.944766879081726, "logps/chosen": -104.27511596679688, "logps/rejected": -80.38945007324219, "loss": 0.6205, "rewards/accuracies": 1.0, "rewards/chosen": 6.360592842102051, "rewards/margins": 0.9634475708007812, "rewards/rejected": 5.3971452713012695, "step": 2201 }, { "epoch": 0.49, "learning_rate": 8.850371753589677e-06, "logits/chosen": -1.5610806941986084, "logits/rejected": -1.49651300907135, "logps/chosen": -41.88690185546875, "logps/rejected": -37.904396057128906, "loss": 1.6523, "rewards/accuracies": 0.0, "rewards/chosen": 1.8135433197021484, "rewards/margins": -0.28658103942871094, "rewards/rejected": 2.1001243591308594, "step": 2202 }, { "epoch": 0.49, "learning_rate": 8.849228083101847e-06, "logits/chosen": -1.4652488231658936, "logits/rejected": -1.4503769874572754, "logps/chosen": -48.975799560546875, "logps/rejected": -54.95353317260742, "loss": 0.6802, "rewards/accuracies": 0.0, "rewards/chosen": 2.272493839263916, "rewards/margins": -0.4941675662994385, "rewards/rejected": 2.7666614055633545, "step": 2203 }, { "epoch": 0.49, "learning_rate": 8.848083917997463e-06, "logits/chosen": -1.5924453735351562, "logits/rejected": -1.5822619199752808, "logps/chosen": -155.8612060546875, "logps/rejected": -169.87881469726562, "loss": 0.5566, "rewards/accuracies": 0.0, "rewards/chosen": 7.483001708984375, "rewards/margins": -0.6941671371459961, "rewards/rejected": 8.177168846130371, "step": 2204 }, { "epoch": 0.49, "learning_rate": 8.846939258423545e-06, "logits/chosen": -1.2813509702682495, "logits/rejected": -1.2813509702682495, "logps/chosen": -33.511070251464844, "logps/rejected": -33.511070251464844, "loss": 0.3654, "rewards/accuracies": 0.0, "rewards/chosen": 1.7054790258407593, "rewards/margins": 0.0, "rewards/rejected": 1.7054790258407593, "step": 2205 }, { "epoch": 0.49, "learning_rate": 8.84579410452718e-06, "logits/chosen": -1.5126910209655762, "logits/rejected": -1.3601598739624023, "logps/chosen": -52.1873779296875, "logps/rejected": -25.820003509521484, "loss": 0.431, "rewards/accuracies": 1.0, "rewards/chosen": 2.653987169265747, "rewards/margins": 1.6936657428741455, "rewards/rejected": 0.9603214263916016, "step": 2206 }, { "epoch": 0.49, "learning_rate": 8.844648456455518e-06, "logits/chosen": -1.5900236368179321, "logits/rejected": -1.5589144229888916, "logps/chosen": -32.892032623291016, "logps/rejected": -50.87153244018555, "loss": 1.0091, "rewards/accuracies": 1.0, "rewards/chosen": 2.6576809883117676, "rewards/margins": 0.17744064331054688, "rewards/rejected": 2.4802403450012207, "step": 2207 }, { "epoch": 0.49, "learning_rate": 8.843502314355771e-06, "logits/chosen": -1.6406813859939575, "logits/rejected": -1.5132216215133667, "logps/chosen": -129.5105438232422, "logps/rejected": -88.86051940917969, "loss": 0.28, "rewards/accuracies": 1.0, "rewards/chosen": 6.045753479003906, "rewards/margins": 0.2923583984375, "rewards/rejected": 5.753395080566406, "step": 2208 }, { "epoch": 0.49, "learning_rate": 8.842355678375217e-06, "logits/chosen": -1.4727855920791626, "logits/rejected": -1.3422104120254517, "logps/chosen": -59.140960693359375, "logps/rejected": -25.924898147583008, "loss": 0.7175, "rewards/accuracies": 1.0, "rewards/chosen": 3.638989210128784, "rewards/margins": 0.9912686347961426, "rewards/rejected": 2.6477205753326416, "step": 2209 }, { "epoch": 0.49, "learning_rate": 8.841208548661195e-06, "logits/chosen": -1.9857062101364136, "logits/rejected": -2.034838914871216, "logps/chosen": -116.27644348144531, "logps/rejected": -68.52535247802734, "loss": 2.8358, "rewards/accuracies": 1.0, "rewards/chosen": 8.828302383422852, "rewards/margins": 4.011585712432861, "rewards/rejected": 4.81671667098999, "step": 2210 }, { "epoch": 0.49, "learning_rate": 8.840060925361109e-06, "logits/chosen": -1.5259888172149658, "logits/rejected": -1.4851572513580322, "logps/chosen": -98.46817779541016, "logps/rejected": -187.08633422851562, "loss": 0.7473, "rewards/accuracies": 0.0, "rewards/chosen": 4.651772499084473, "rewards/margins": -0.9325141906738281, "rewards/rejected": 5.584286689758301, "step": 2211 }, { "epoch": 0.49, "learning_rate": 8.838912808622424e-06, "logits/chosen": -1.5725650787353516, "logits/rejected": -1.313204050064087, "logps/chosen": -39.09490203857422, "logps/rejected": -73.51570892333984, "loss": 1.2439, "rewards/accuracies": 0.0, "rewards/chosen": 2.270646810531616, "rewards/margins": -1.612884521484375, "rewards/rejected": 3.883531332015991, "step": 2212 }, { "epoch": 0.49, "learning_rate": 8.837764198592672e-06, "logits/chosen": -1.2841947078704834, "logits/rejected": -1.2807211875915527, "logps/chosen": -197.9622039794922, "logps/rejected": -54.45703125, "loss": 0.5653, "rewards/accuracies": 0.0, "rewards/chosen": 6.016728401184082, "rewards/margins": -0.7384781837463379, "rewards/rejected": 6.75520658493042, "step": 2213 }, { "epoch": 0.49, "learning_rate": 8.836615095419448e-06, "logits/chosen": -1.2104231119155884, "logits/rejected": -1.2547030448913574, "logps/chosen": -13.387690544128418, "logps/rejected": -36.0666389465332, "loss": 1.502, "rewards/accuracies": 0.0, "rewards/chosen": 0.9484372138977051, "rewards/margins": -2.2064764499664307, "rewards/rejected": 3.1549136638641357, "step": 2214 }, { "epoch": 0.49, "learning_rate": 8.835465499250404e-06, "logits/chosen": -1.5874665975570679, "logits/rejected": -1.573519229888916, "logps/chosen": -37.730010986328125, "logps/rejected": -46.78752517700195, "loss": 1.1201, "rewards/accuracies": 0.0, "rewards/chosen": 1.5553802251815796, "rewards/margins": -1.3077083826065063, "rewards/rejected": 2.863088607788086, "step": 2215 }, { "epoch": 0.49, "learning_rate": 8.834315410233264e-06, "logits/chosen": -1.6798655986785889, "logits/rejected": -1.5517497062683105, "logps/chosen": -105.43173217773438, "logps/rejected": -81.56165313720703, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 7.561773777008057, "rewards/margins": 4.780478477478027, "rewards/rejected": 2.78129506111145, "step": 2216 }, { "epoch": 0.49, "learning_rate": 8.833164828515815e-06, "logits/chosen": -1.5354766845703125, "logits/rejected": -1.523221492767334, "logps/chosen": -106.47932434082031, "logps/rejected": -131.78485107421875, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 7.668586730957031, "rewards/margins": 2.743516445159912, "rewards/rejected": 4.925070285797119, "step": 2217 }, { "epoch": 0.49, "learning_rate": 8.832013754245895e-06, "logits/chosen": -2.0371077060699463, "logits/rejected": -2.0090298652648926, "logps/chosen": -76.57479095458984, "logps/rejected": -61.57355499267578, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 8.09848403930664, "rewards/margins": 4.2218451499938965, "rewards/rejected": 3.876638889312744, "step": 2218 }, { "epoch": 0.49, "learning_rate": 8.830862187571423e-06, "logits/chosen": -1.554915428161621, "logits/rejected": -1.4167934656143188, "logps/chosen": -59.322235107421875, "logps/rejected": -23.720321655273438, "loss": 0.2841, "rewards/accuracies": 1.0, "rewards/chosen": 2.659926652908325, "rewards/margins": 1.441952109336853, "rewards/rejected": 1.2179745435714722, "step": 2219 }, { "epoch": 0.49, "learning_rate": 8.829710128640368e-06, "logits/chosen": -1.4735362529754639, "logits/rejected": -1.4652985334396362, "logps/chosen": -134.98983764648438, "logps/rejected": -134.47381591796875, "loss": 0.5832, "rewards/accuracies": 1.0, "rewards/chosen": 8.263648986816406, "rewards/margins": 6.050023078918457, "rewards/rejected": 2.2136261463165283, "step": 2220 }, { "epoch": 0.49, "learning_rate": 8.828557577600769e-06, "logits/chosen": -1.3439972400665283, "logits/rejected": -1.3138031959533691, "logps/chosen": -38.40135955810547, "logps/rejected": -23.477081298828125, "loss": 1.304, "rewards/accuracies": 0.0, "rewards/chosen": 1.996344804763794, "rewards/margins": -1.7553558349609375, "rewards/rejected": 3.7517006397247314, "step": 2221 }, { "epoch": 0.49, "learning_rate": 8.827404534600723e-06, "logits/chosen": -1.2072386741638184, "logits/rejected": -1.1317012310028076, "logps/chosen": -76.65554809570312, "logps/rejected": -65.96971130371094, "loss": 1.5127, "rewards/accuracies": 0.0, "rewards/chosen": 2.401663303375244, "rewards/margins": -2.5164389610290527, "rewards/rejected": 4.918102264404297, "step": 2222 }, { "epoch": 0.49, "learning_rate": 8.826250999788397e-06, "logits/chosen": -1.7369343042373657, "logits/rejected": -1.6961780786514282, "logps/chosen": -93.97297668457031, "logps/rejected": -69.7472152709961, "loss": 1.8615, "rewards/accuracies": 1.0, "rewards/chosen": 4.921902656555176, "rewards/margins": 1.414160966873169, "rewards/rejected": 3.507741689682007, "step": 2223 }, { "epoch": 0.49, "learning_rate": 8.825096973312014e-06, "logits/chosen": -1.7362135648727417, "logits/rejected": -1.4896492958068848, "logps/chosen": -65.01766967773438, "logps/rejected": -63.683311462402344, "loss": 2.2093, "rewards/accuracies": 0.0, "rewards/chosen": 0.961987316608429, "rewards/margins": -4.298898220062256, "rewards/rejected": 5.260885715484619, "step": 2224 }, { "epoch": 0.49, "learning_rate": 8.823942455319866e-06, "logits/chosen": -1.4698504209518433, "logits/rejected": -1.383453130722046, "logps/chosen": -44.15193557739258, "logps/rejected": -46.13689422607422, "loss": 0.5484, "rewards/accuracies": 1.0, "rewards/chosen": 2.828526735305786, "rewards/margins": 0.10775256156921387, "rewards/rejected": 2.7207741737365723, "step": 2225 }, { "epoch": 0.49, "learning_rate": 8.822787445960303e-06, "logits/chosen": -1.7226349115371704, "logits/rejected": -1.6818009614944458, "logps/chosen": -81.00435638427734, "logps/rejected": -40.32227325439453, "loss": 1.4369, "rewards/accuracies": 1.0, "rewards/chosen": 5.10650110244751, "rewards/margins": 1.1423416137695312, "rewards/rejected": 3.9641594886779785, "step": 2226 }, { "epoch": 0.49, "learning_rate": 8.821631945381746e-06, "logits/chosen": -1.3216675519943237, "logits/rejected": -1.2134695053100586, "logps/chosen": -56.575469970703125, "logps/rejected": -26.579425811767578, "loss": 2.4011, "rewards/accuracies": 1.0, "rewards/chosen": 2.2512481212615967, "rewards/margins": 2.4737255573272705, "rewards/rejected": -0.22247754037380219, "step": 2227 }, { "epoch": 0.49, "learning_rate": 8.82047595373267e-06, "logits/chosen": -1.4528597593307495, "logits/rejected": -1.4966604709625244, "logps/chosen": -39.307456970214844, "logps/rejected": -120.30245971679688, "loss": 1.202, "rewards/accuracies": 0.0, "rewards/chosen": 3.0462913513183594, "rewards/margins": -1.8298654556274414, "rewards/rejected": 4.876156806945801, "step": 2228 }, { "epoch": 0.49, "learning_rate": 8.819319471161617e-06, "logits/chosen": -1.4004870653152466, "logits/rejected": -1.4940186738967896, "logps/chosen": -95.74327087402344, "logps/rejected": -79.79087829589844, "loss": 2.833, "rewards/accuracies": 0.0, "rewards/chosen": 3.2146241664886475, "rewards/margins": -5.405271530151367, "rewards/rejected": 8.619895935058594, "step": 2229 }, { "epoch": 0.49, "learning_rate": 8.818162497817195e-06, "logits/chosen": -1.5858615636825562, "logits/rejected": -1.676996111869812, "logps/chosen": -58.48883819580078, "logps/rejected": -137.3859100341797, "loss": 2.0484, "rewards/accuracies": 0.0, "rewards/chosen": 1.8348839282989502, "rewards/margins": -4.060152053833008, "rewards/rejected": 5.895036220550537, "step": 2230 }, { "epoch": 0.49, "learning_rate": 8.81700503384807e-06, "logits/chosen": -1.6372179985046387, "logits/rejected": -1.4110066890716553, "logps/chosen": -28.633609771728516, "logps/rejected": -21.228103637695312, "loss": 1.2707, "rewards/accuracies": 1.0, "rewards/chosen": 2.4289448261260986, "rewards/margins": 0.3666977882385254, "rewards/rejected": 2.0622470378875732, "step": 2231 }, { "epoch": 0.49, "learning_rate": 8.815847079402972e-06, "logits/chosen": -1.446936845779419, "logits/rejected": -1.4114861488342285, "logps/chosen": -26.768558502197266, "logps/rejected": -64.87847900390625, "loss": 1.8854, "rewards/accuracies": 0.0, "rewards/chosen": 1.7879489660263062, "rewards/margins": -0.43841683864593506, "rewards/rejected": 2.226365804672241, "step": 2232 }, { "epoch": 0.49, "learning_rate": 8.814688634630699e-06, "logits/chosen": -1.7095937728881836, "logits/rejected": -1.7095937728881836, "logps/chosen": -87.24765014648438, "logps/rejected": -87.24765014648438, "loss": 0.3505, "rewards/accuracies": 0.0, "rewards/chosen": 5.371182441711426, "rewards/margins": 0.0, "rewards/rejected": 5.371182441711426, "step": 2233 }, { "epoch": 0.49, "learning_rate": 8.813529699680108e-06, "logits/chosen": -1.771551489830017, "logits/rejected": -1.7868552207946777, "logps/chosen": -106.30958557128906, "logps/rejected": -106.81926727294922, "loss": 0.8798, "rewards/accuracies": 0.0, "rewards/chosen": 5.81441068649292, "rewards/margins": -0.9885473251342773, "rewards/rejected": 6.802958011627197, "step": 2234 }, { "epoch": 0.49, "learning_rate": 8.812370274700117e-06, "logits/chosen": -1.59578537940979, "logits/rejected": -1.569969892501831, "logps/chosen": -53.56194305419922, "logps/rejected": -35.032623291015625, "loss": 0.3297, "rewards/accuracies": 1.0, "rewards/chosen": 5.066913604736328, "rewards/margins": 3.0683181285858154, "rewards/rejected": 1.9985954761505127, "step": 2235 }, { "epoch": 0.49, "learning_rate": 8.81121035983971e-06, "logits/chosen": -1.778569221496582, "logits/rejected": -1.72881281375885, "logps/chosen": -126.7080078125, "logps/rejected": -92.77593994140625, "loss": 1.7069, "rewards/accuracies": 0.0, "rewards/chosen": 6.291708469390869, "rewards/margins": -1.1090102195739746, "rewards/rejected": 7.400718688964844, "step": 2236 }, { "epoch": 0.5, "learning_rate": 8.810049955247933e-06, "logits/chosen": -1.3456525802612305, "logits/rejected": -1.2223769426345825, "logps/chosen": -33.357643127441406, "logps/rejected": -15.817535400390625, "loss": 0.5626, "rewards/accuracies": 1.0, "rewards/chosen": 1.7286041975021362, "rewards/margins": 0.615607738494873, "rewards/rejected": 1.1129964590072632, "step": 2237 }, { "epoch": 0.5, "learning_rate": 8.808889061073897e-06, "logits/chosen": -1.5883663892745972, "logits/rejected": -1.60744309425354, "logps/chosen": -52.70875930786133, "logps/rejected": -63.06646728515625, "loss": 1.9085, "rewards/accuracies": 0.0, "rewards/chosen": 1.4940731525421143, "rewards/margins": -2.097538471221924, "rewards/rejected": 3.591611623764038, "step": 2238 }, { "epoch": 0.5, "learning_rate": 8.807727677466773e-06, "logits/chosen": -1.7260991334915161, "logits/rejected": -1.6963772773742676, "logps/chosen": -82.4110107421875, "logps/rejected": -76.75148010253906, "loss": 0.9702, "rewards/accuracies": 0.0, "rewards/chosen": 2.8727662563323975, "rewards/margins": -1.6022183895111084, "rewards/rejected": 4.474984645843506, "step": 2239 }, { "epoch": 0.5, "learning_rate": 8.806565804575796e-06, "logits/chosen": -1.6927770376205444, "logits/rejected": -1.7658668756484985, "logps/chosen": -66.0339584350586, "logps/rejected": -97.66082000732422, "loss": 2.1514, "rewards/accuracies": 0.0, "rewards/chosen": 3.4261696338653564, "rewards/margins": -3.582257032394409, "rewards/rejected": 7.008426666259766, "step": 2240 }, { "epoch": 0.5, "learning_rate": 8.805403442550261e-06, "logits/chosen": -1.6289246082305908, "logits/rejected": -1.5180517435073853, "logps/chosen": -59.52345275878906, "logps/rejected": -53.4659538269043, "loss": 0.1225, "rewards/accuracies": 1.0, "rewards/chosen": 4.468069553375244, "rewards/margins": 1.4246563911437988, "rewards/rejected": 3.0434131622314453, "step": 2241 }, { "epoch": 0.5, "learning_rate": 8.804240591539537e-06, "logits/chosen": -1.5178195238113403, "logits/rejected": -1.458476185798645, "logps/chosen": -65.56439208984375, "logps/rejected": -55.061790466308594, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/chosen": 5.781165599822998, "rewards/margins": 2.1418795585632324, "rewards/rejected": 3.6392860412597656, "step": 2242 }, { "epoch": 0.5, "learning_rate": 8.80307725169304e-06, "logits/chosen": -1.2900439500808716, "logits/rejected": -1.2732723951339722, "logps/chosen": -60.48771286010742, "logps/rejected": -80.21964263916016, "loss": 0.2618, "rewards/accuracies": 1.0, "rewards/chosen": 3.0236356258392334, "rewards/margins": 1.0700322389602661, "rewards/rejected": 1.9536033868789673, "step": 2243 }, { "epoch": 0.5, "learning_rate": 8.801913423160256e-06, "logits/chosen": -1.5129845142364502, "logits/rejected": -1.3757472038269043, "logps/chosen": -126.7613754272461, "logps/rejected": -30.597084045410156, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 5.064341068267822, "rewards/margins": 0.9514045715332031, "rewards/rejected": 4.112936496734619, "step": 2244 }, { "epoch": 0.5, "learning_rate": 8.800749106090739e-06, "logits/chosen": -1.6019446849822998, "logits/rejected": -1.484455943107605, "logps/chosen": -51.27043533325195, "logps/rejected": -68.93590545654297, "loss": 0.8374, "rewards/accuracies": 0.0, "rewards/chosen": 2.555711030960083, "rewards/margins": -1.0727832317352295, "rewards/rejected": 3.6284942626953125, "step": 2245 }, { "epoch": 0.5, "learning_rate": 8.799584300634096e-06, "logits/chosen": -1.1788969039916992, "logits/rejected": -1.1608763933181763, "logps/chosen": -37.52777862548828, "logps/rejected": -56.28457260131836, "loss": 2.023, "rewards/accuracies": 1.0, "rewards/chosen": 2.5674588680267334, "rewards/margins": 0.22003865242004395, "rewards/rejected": 2.3474202156066895, "step": 2246 }, { "epoch": 0.5, "learning_rate": 8.798419006940008e-06, "logits/chosen": -1.3598891496658325, "logits/rejected": -1.3559226989746094, "logps/chosen": -58.219520568847656, "logps/rejected": -76.81806945800781, "loss": 3.0489, "rewards/accuracies": 0.0, "rewards/chosen": 3.5150749683380127, "rewards/margins": -0.9209311008453369, "rewards/rejected": 4.43600606918335, "step": 2247 }, { "epoch": 0.5, "learning_rate": 8.797253225158206e-06, "logits/chosen": -1.4833104610443115, "logits/rejected": -1.1426482200622559, "logps/chosen": -135.9611358642578, "logps/rejected": -21.555461883544922, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 6.277235507965088, "rewards/margins": 5.410494804382324, "rewards/rejected": 0.8667408227920532, "step": 2248 }, { "epoch": 0.5, "learning_rate": 8.796086955438494e-06, "logits/chosen": -1.3826677799224854, "logits/rejected": -1.3472504615783691, "logps/chosen": -70.0141372680664, "logps/rejected": -59.946556091308594, "loss": 0.2069, "rewards/accuracies": 1.0, "rewards/chosen": 3.599355459213257, "rewards/margins": 0.6713197231292725, "rewards/rejected": 2.9280357360839844, "step": 2249 }, { "epoch": 0.5, "learning_rate": 8.794920197930735e-06, "logits/chosen": -1.1683924198150635, "logits/rejected": -1.1573601961135864, "logps/chosen": -6.815900802612305, "logps/rejected": -3.3215060234069824, "loss": 0.4472, "rewards/accuracies": 0.0, "rewards/chosen": 0.6745530962944031, "rewards/margins": -0.3306148648262024, "rewards/rejected": 1.0051679611206055, "step": 2250 }, { "epoch": 0.5, "learning_rate": 8.79375295278485e-06, "logits/chosen": -1.8009144067764282, "logits/rejected": -1.8265223503112793, "logps/chosen": -41.89366149902344, "logps/rejected": -99.94937133789062, "loss": 1.1336, "rewards/accuracies": 0.0, "rewards/chosen": 3.349541425704956, "rewards/margins": -1.3230478763580322, "rewards/rejected": 4.672589302062988, "step": 2251 }, { "epoch": 0.5, "learning_rate": 8.792585220150834e-06, "logits/chosen": -1.7151857614517212, "logits/rejected": -1.3479933738708496, "logps/chosen": -67.61945343017578, "logps/rejected": -72.33210754394531, "loss": 0.5542, "rewards/accuracies": 1.0, "rewards/chosen": 3.21579909324646, "rewards/margins": 0.03506779670715332, "rewards/rejected": 3.1807312965393066, "step": 2252 }, { "epoch": 0.5, "learning_rate": 8.791417000178732e-06, "logits/chosen": -1.8654024600982666, "logits/rejected": -1.8290836811065674, "logps/chosen": -87.09725952148438, "logps/rejected": -61.425628662109375, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": 6.8597564697265625, "rewards/margins": 3.849879503250122, "rewards/rejected": 3.0098769664764404, "step": 2253 }, { "epoch": 0.5, "learning_rate": 8.790248293018662e-06, "logits/chosen": -1.4386261701583862, "logits/rejected": -1.3550974130630493, "logps/chosen": -29.61516571044922, "logps/rejected": -14.188592910766602, "loss": 0.2412, "rewards/accuracies": 1.0, "rewards/chosen": 1.9017280340194702, "rewards/margins": 0.848139762878418, "rewards/rejected": 1.0535882711410522, "step": 2254 }, { "epoch": 0.5, "learning_rate": 8.789079098820796e-06, "logits/chosen": -1.3085416555404663, "logits/rejected": -1.3085416555404663, "logps/chosen": -3.5699210166931152, "logps/rejected": -3.5699210166931152, "loss": 0.3492, "rewards/accuracies": 0.0, "rewards/chosen": 1.353692889213562, "rewards/margins": 0.0, "rewards/rejected": 1.353692889213562, "step": 2255 }, { "epoch": 0.5, "learning_rate": 8.787909417735374e-06, "logits/chosen": -1.441380262374878, "logits/rejected": -1.441380262374878, "logps/chosen": -38.500038146972656, "logps/rejected": -38.500038146972656, "loss": 0.8716, "rewards/accuracies": 0.0, "rewards/chosen": 1.8969672918319702, "rewards/margins": 0.0, "rewards/rejected": 1.8969672918319702, "step": 2256 }, { "epoch": 0.5, "learning_rate": 8.7867392499127e-06, "logits/chosen": -1.5033124685287476, "logits/rejected": -1.4379926919937134, "logps/chosen": -43.704254150390625, "logps/rejected": -23.531787872314453, "loss": 0.2383, "rewards/accuracies": 1.0, "rewards/chosen": 2.4859459400177, "rewards/margins": 0.5112408399581909, "rewards/rejected": 1.9747051000595093, "step": 2257 }, { "epoch": 0.5, "learning_rate": 8.785568595503134e-06, "logits/chosen": -1.7110064029693604, "logits/rejected": -1.599283218383789, "logps/chosen": -115.35015869140625, "logps/rejected": -48.463714599609375, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 5.851222515106201, "rewards/margins": 3.2074434757232666, "rewards/rejected": 2.6437790393829346, "step": 2258 }, { "epoch": 0.5, "learning_rate": 8.784397454657103e-06, "logits/chosen": -1.5963472127914429, "logits/rejected": -1.585010051727295, "logps/chosen": -93.04402923583984, "logps/rejected": -62.812835693359375, "loss": 0.4937, "rewards/accuracies": 0.0, "rewards/chosen": 2.9447357654571533, "rewards/margins": -0.4412720203399658, "rewards/rejected": 3.386007785797119, "step": 2259 }, { "epoch": 0.5, "learning_rate": 8.783225827525098e-06, "logits/chosen": -1.9218989610671997, "logits/rejected": -1.8663620948791504, "logps/chosen": -94.75931549072266, "logps/rejected": -149.930419921875, "loss": 1.2562, "rewards/accuracies": 0.0, "rewards/chosen": 6.346987247467041, "rewards/margins": -2.3777318000793457, "rewards/rejected": 8.724719047546387, "step": 2260 }, { "epoch": 0.5, "learning_rate": 8.782053714257668e-06, "logits/chosen": -1.2220491170883179, "logits/rejected": -1.2220491170883179, "logps/chosen": -25.883535385131836, "logps/rejected": -25.883535385131836, "loss": 0.3486, "rewards/accuracies": 0.0, "rewards/chosen": 2.0366837978363037, "rewards/margins": 0.0, "rewards/rejected": 2.0366837978363037, "step": 2261 }, { "epoch": 0.5, "learning_rate": 8.780881115005428e-06, "logits/chosen": -1.886290431022644, "logits/rejected": -1.862180233001709, "logps/chosen": -67.94015502929688, "logps/rejected": -102.12873840332031, "loss": 0.8927, "rewards/accuracies": 0.0, "rewards/chosen": 4.9953813552856445, "rewards/margins": -1.3831663131713867, "rewards/rejected": 6.378547668457031, "step": 2262 }, { "epoch": 0.5, "learning_rate": 8.779708029919054e-06, "logits/chosen": -1.6303918361663818, "logits/rejected": -1.6001077890396118, "logps/chosen": -57.86945343017578, "logps/rejected": -38.902442932128906, "loss": 0.3378, "rewards/accuracies": 1.0, "rewards/chosen": 3.345329999923706, "rewards/margins": 0.5628805160522461, "rewards/rejected": 2.78244948387146, "step": 2263 }, { "epoch": 0.5, "learning_rate": 8.778534459149283e-06, "logits/chosen": -1.6777324676513672, "logits/rejected": -1.5894923210144043, "logps/chosen": -68.19725036621094, "logps/rejected": -50.48365020751953, "loss": 0.2503, "rewards/accuracies": 1.0, "rewards/chosen": 4.048020362854004, "rewards/margins": 0.49655842781066895, "rewards/rejected": 3.551461935043335, "step": 2264 }, { "epoch": 0.5, "learning_rate": 8.777360402846919e-06, "logits/chosen": -1.6312075853347778, "logits/rejected": -1.643522024154663, "logps/chosen": -67.71488952636719, "logps/rejected": -100.16722869873047, "loss": 1.7917, "rewards/accuracies": 0.0, "rewards/chosen": 3.372854709625244, "rewards/margins": -3.4908103942871094, "rewards/rejected": 6.8636651039123535, "step": 2265 }, { "epoch": 0.5, "learning_rate": 8.776185861162822e-06, "logits/chosen": -1.8178274631500244, "logits/rejected": -1.7505203485488892, "logps/chosen": -50.98719787597656, "logps/rejected": -60.21698760986328, "loss": 0.8502, "rewards/accuracies": 0.0, "rewards/chosen": 1.7093842029571533, "rewards/margins": -1.4423561096191406, "rewards/rejected": 3.151740312576294, "step": 2266 }, { "epoch": 0.5, "learning_rate": 8.77501083424792e-06, "logits/chosen": -1.6432467699050903, "logits/rejected": -1.5577881336212158, "logps/chosen": -131.57733154296875, "logps/rejected": -43.85734558105469, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 7.075399875640869, "rewards/margins": 1.9629240036010742, "rewards/rejected": 5.112475872039795, "step": 2267 }, { "epoch": 0.5, "learning_rate": 8.773835322253202e-06, "logits/chosen": -1.3555940389633179, "logits/rejected": -1.369126558303833, "logps/chosen": -59.33143997192383, "logps/rejected": -60.56526184082031, "loss": 2.5772, "rewards/accuracies": 0.0, "rewards/chosen": 2.0753934383392334, "rewards/margins": -4.776445388793945, "rewards/rejected": 6.8518385887146, "step": 2268 }, { "epoch": 0.5, "learning_rate": 8.772659325329717e-06, "logits/chosen": -1.2616055011749268, "logits/rejected": -1.2616055011749268, "logps/chosen": -51.542842864990234, "logps/rejected": -51.542842864990234, "loss": 0.9352, "rewards/accuracies": 0.0, "rewards/chosen": 5.381733417510986, "rewards/margins": 0.0, "rewards/rejected": 5.381733417510986, "step": 2269 }, { "epoch": 0.5, "learning_rate": 8.771482843628576e-06, "logits/chosen": -1.3563467264175415, "logits/rejected": -1.2521088123321533, "logps/chosen": -98.70918273925781, "logps/rejected": -100.8111343383789, "loss": 0.1576, "rewards/accuracies": 1.0, "rewards/chosen": 6.5078630447387695, "rewards/margins": 2.8755457401275635, "rewards/rejected": 3.632317304611206, "step": 2270 }, { "epoch": 0.5, "learning_rate": 8.770305877300958e-06, "logits/chosen": -1.6013234853744507, "logits/rejected": -1.6013234853744507, "logps/chosen": -32.360801696777344, "logps/rejected": -32.360801696777344, "loss": 0.6909, "rewards/accuracies": 0.0, "rewards/chosen": 4.121250152587891, "rewards/margins": 0.0, "rewards/rejected": 4.121250152587891, "step": 2271 }, { "epoch": 0.5, "learning_rate": 8.769128426498098e-06, "logits/chosen": -1.5205206871032715, "logits/rejected": -1.3984085321426392, "logps/chosen": -85.39177703857422, "logps/rejected": -57.19049835205078, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 8.448652267456055, "rewards/margins": 7.002810478210449, "rewards/rejected": 1.4458420276641846, "step": 2272 }, { "epoch": 0.5, "learning_rate": 8.767950491371295e-06, "logits/chosen": -1.3661054372787476, "logits/rejected": -1.438233494758606, "logps/chosen": -63.527061462402344, "logps/rejected": -108.54733276367188, "loss": 3.2125, "rewards/accuracies": 0.0, "rewards/chosen": 2.4042916297912598, "rewards/margins": -5.846102237701416, "rewards/rejected": 8.250393867492676, "step": 2273 }, { "epoch": 0.5, "learning_rate": 8.766772072071911e-06, "logits/chosen": -1.2147129774093628, "logits/rejected": -1.307060718536377, "logps/chosen": -65.2246322631836, "logps/rejected": -171.9818115234375, "loss": 2.6831, "rewards/accuracies": 0.0, "rewards/chosen": 1.485981822013855, "rewards/margins": -5.201655387878418, "rewards/rejected": 6.6876373291015625, "step": 2274 }, { "epoch": 0.5, "learning_rate": 8.765593168751373e-06, "logits/chosen": -1.3575341701507568, "logits/rejected": -1.3759340047836304, "logps/chosen": -95.89961242675781, "logps/rejected": -96.33617401123047, "loss": 0.6655, "rewards/accuracies": 1.0, "rewards/chosen": 6.423895359039307, "rewards/margins": 0.40837955474853516, "rewards/rejected": 6.0155158042907715, "step": 2275 }, { "epoch": 0.5, "learning_rate": 8.764413781561164e-06, "logits/chosen": -1.2970428466796875, "logits/rejected": -1.1887791156768799, "logps/chosen": -105.03842163085938, "logps/rejected": -64.0486068725586, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 5.548126220703125, "rewards/margins": 2.5757408142089844, "rewards/rejected": 2.9723854064941406, "step": 2276 }, { "epoch": 0.5, "learning_rate": 8.763233910652833e-06, "logits/chosen": -1.780717134475708, "logits/rejected": -1.7847254276275635, "logps/chosen": -73.0715103149414, "logps/rejected": -57.303794860839844, "loss": 1.2217, "rewards/accuracies": 0.0, "rewards/chosen": 0.8936454653739929, "rewards/margins": -2.343287706375122, "rewards/rejected": 3.2369332313537598, "step": 2277 }, { "epoch": 0.5, "learning_rate": 8.762053556177991e-06, "logits/chosen": -1.5034559965133667, "logits/rejected": -1.4670592546463013, "logps/chosen": -56.252113342285156, "logps/rejected": -46.168968200683594, "loss": 0.902, "rewards/accuracies": 0.0, "rewards/chosen": 3.4575142860412598, "rewards/margins": -1.5815529823303223, "rewards/rejected": 5.039067268371582, "step": 2278 }, { "epoch": 0.5, "learning_rate": 8.760872718288311e-06, "logits/chosen": -1.5372544527053833, "logits/rejected": -1.4938920736312866, "logps/chosen": -76.91675567626953, "logps/rejected": -94.82902526855469, "loss": 0.6266, "rewards/accuracies": 0.0, "rewards/chosen": 2.3661201000213623, "rewards/margins": -0.4914581775665283, "rewards/rejected": 2.8575782775878906, "step": 2279 }, { "epoch": 0.5, "learning_rate": 8.759691397135528e-06, "logits/chosen": -1.2978776693344116, "logits/rejected": -1.2978776693344116, "logps/chosen": -64.25948333740234, "logps/rejected": -64.25948333740234, "loss": 0.7613, "rewards/accuracies": 0.0, "rewards/chosen": 2.4216887950897217, "rewards/margins": 0.0, "rewards/rejected": 2.4216887950897217, "step": 2280 }, { "epoch": 0.5, "learning_rate": 8.758509592871439e-06, "logits/chosen": -1.5382765531539917, "logits/rejected": -1.498552918434143, "logps/chosen": -148.74734497070312, "logps/rejected": -193.10498046875, "loss": 0.9233, "rewards/accuracies": 0.0, "rewards/chosen": 6.1423492431640625, "rewards/margins": -1.6189970970153809, "rewards/rejected": 7.761346340179443, "step": 2281 }, { "epoch": 0.51, "learning_rate": 8.7573273056479e-06, "logits/chosen": -1.505996584892273, "logits/rejected": -1.1608035564422607, "logps/chosen": -35.08867645263672, "logps/rejected": -36.18035125732422, "loss": 0.7177, "rewards/accuracies": 1.0, "rewards/chosen": 3.7948975563049316, "rewards/margins": 0.9799797534942627, "rewards/rejected": 2.814917802810669, "step": 2282 }, { "epoch": 0.51, "learning_rate": 8.756144535616838e-06, "logits/chosen": -1.6932079792022705, "logits/rejected": -1.6807881593704224, "logps/chosen": -53.69074630737305, "logps/rejected": -16.698827743530273, "loss": 0.2651, "rewards/accuracies": 1.0, "rewards/chosen": 3.5291316509246826, "rewards/margins": 1.1453676223754883, "rewards/rejected": 2.3837640285491943, "step": 2283 }, { "epoch": 0.51, "learning_rate": 8.754961282930231e-06, "logits/chosen": -1.6890801191329956, "logits/rejected": -1.6210933923721313, "logps/chosen": -101.94261169433594, "logps/rejected": -74.84417724609375, "loss": 1.0344, "rewards/accuracies": 0.0, "rewards/chosen": 1.23332679271698, "rewards/margins": -1.8901289701461792, "rewards/rejected": 3.123455762863159, "step": 2284 }, { "epoch": 0.51, "learning_rate": 8.753777547740126e-06, "logits/chosen": -1.5519237518310547, "logits/rejected": -1.5551958084106445, "logps/chosen": -49.81241226196289, "logps/rejected": -22.536296844482422, "loss": 0.6411, "rewards/accuracies": 0.0, "rewards/chosen": 1.3299744129180908, "rewards/margins": -0.6156498193740845, "rewards/rejected": 1.9456242322921753, "step": 2285 }, { "epoch": 0.51, "learning_rate": 8.752593330198631e-06, "logits/chosen": -2.047449827194214, "logits/rejected": -1.992653250694275, "logps/chosen": -62.39822006225586, "logps/rejected": -28.041248321533203, "loss": 0.3012, "rewards/accuracies": 1.0, "rewards/chosen": 2.729970932006836, "rewards/margins": 1.598340630531311, "rewards/rejected": 1.131630301475525, "step": 2286 }, { "epoch": 0.51, "learning_rate": 8.751408630457911e-06, "logits/chosen": -1.6097508668899536, "logits/rejected": -1.4984074831008911, "logps/chosen": -102.14363098144531, "logps/rejected": -197.23165893554688, "loss": 0.3433, "rewards/accuracies": 1.0, "rewards/chosen": 5.838539123535156, "rewards/margins": 0.1462721824645996, "rewards/rejected": 5.692266941070557, "step": 2287 }, { "epoch": 0.51, "learning_rate": 8.750223448670204e-06, "logits/chosen": -1.469391107559204, "logits/rejected": -1.4511672258377075, "logps/chosen": -51.31903839111328, "logps/rejected": -84.36934661865234, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 3.2025582790374756, "rewards/margins": 1.8404206037521362, "rewards/rejected": 1.3621376752853394, "step": 2288 }, { "epoch": 0.51, "learning_rate": 8.749037784987797e-06, "logits/chosen": -1.453176498413086, "logits/rejected": -1.453176498413086, "logps/chosen": -43.99626922607422, "logps/rejected": -43.99626922607422, "loss": 0.3593, "rewards/accuracies": 0.0, "rewards/chosen": 3.7224204540252686, "rewards/margins": 0.0, "rewards/rejected": 3.7224204540252686, "step": 2289 }, { "epoch": 0.51, "learning_rate": 8.747851639563048e-06, "logits/chosen": -1.5607057809829712, "logits/rejected": -1.3834892511367798, "logps/chosen": -107.43930053710938, "logps/rejected": -31.757537841796875, "loss": 2.7758, "rewards/accuracies": 1.0, "rewards/chosen": 2.853741407394409, "rewards/margins": 2.949167013168335, "rewards/rejected": -0.09542560577392578, "step": 2290 }, { "epoch": 0.51, "learning_rate": 8.746665012548373e-06, "logits/chosen": -1.3713250160217285, "logits/rejected": -1.3820266723632812, "logps/chosen": -48.99954605102539, "logps/rejected": -33.09965515136719, "loss": 2.1826, "rewards/accuracies": 0.0, "rewards/chosen": 2.6369855403900146, "rewards/margins": -0.6487827301025391, "rewards/rejected": 3.2857682704925537, "step": 2291 }, { "epoch": 0.51, "learning_rate": 8.745477904096247e-06, "logits/chosen": -1.478420615196228, "logits/rejected": -1.364327311515808, "logps/chosen": -45.89191436767578, "logps/rejected": -44.56189727783203, "loss": 0.9392, "rewards/accuracies": 0.0, "rewards/chosen": 1.5946849584579468, "rewards/margins": -0.8751567602157593, "rewards/rejected": 2.469841718673706, "step": 2292 }, { "epoch": 0.51, "learning_rate": 8.744290314359219e-06, "logits/chosen": -1.5045199394226074, "logits/rejected": -1.4111835956573486, "logps/chosen": -90.06684875488281, "logps/rejected": -78.40998840332031, "loss": 0.8797, "rewards/accuracies": 1.0, "rewards/chosen": 7.094645977020264, "rewards/margins": 2.3750228881835938, "rewards/rejected": 4.71962308883667, "step": 2293 }, { "epoch": 0.51, "learning_rate": 8.743102243489885e-06, "logits/chosen": -1.7229042053222656, "logits/rejected": -1.789455771446228, "logps/chosen": -62.59416198730469, "logps/rejected": -100.75479125976562, "loss": 1.7067, "rewards/accuracies": 0.0, "rewards/chosen": 3.9651100635528564, "rewards/margins": -3.345857858657837, "rewards/rejected": 7.310967922210693, "step": 2294 }, { "epoch": 0.51, "learning_rate": 8.74191369164091e-06, "logits/chosen": -1.8187098503112793, "logits/rejected": -1.8563761711120605, "logps/chosen": -58.74851989746094, "logps/rejected": -62.30113220214844, "loss": 1.357, "rewards/accuracies": 0.0, "rewards/chosen": 2.942331075668335, "rewards/margins": -1.477912187576294, "rewards/rejected": 4.420243263244629, "step": 2295 }, { "epoch": 0.51, "learning_rate": 8.74072465896502e-06, "logits/chosen": -1.4540125131607056, "logits/rejected": -1.4299498796463013, "logps/chosen": -143.18775939941406, "logps/rejected": -85.90206909179688, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 8.13572883605957, "rewards/margins": 1.7932591438293457, "rewards/rejected": 6.342469692230225, "step": 2296 }, { "epoch": 0.51, "learning_rate": 8.739535145615005e-06, "logits/chosen": -1.488540768623352, "logits/rejected": -1.3839075565338135, "logps/chosen": -121.49049377441406, "logps/rejected": -52.54461669921875, "loss": 0.165, "rewards/accuracies": 1.0, "rewards/chosen": 6.776599407196045, "rewards/margins": 3.9860551357269287, "rewards/rejected": 2.790544271469116, "step": 2297 }, { "epoch": 0.51, "learning_rate": 8.738345151743715e-06, "logits/chosen": -1.3037095069885254, "logits/rejected": -1.3736023902893066, "logps/chosen": -51.26396179199219, "logps/rejected": -55.96538543701172, "loss": 1.1905, "rewards/accuracies": 0.0, "rewards/chosen": 2.276437520980835, "rewards/margins": -1.8348290920257568, "rewards/rejected": 4.111266613006592, "step": 2298 }, { "epoch": 0.51, "learning_rate": 8.737154677504059e-06, "logits/chosen": -1.5013413429260254, "logits/rejected": -1.4706628322601318, "logps/chosen": -109.38763427734375, "logps/rejected": -55.1165657043457, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 6.090460300445557, "rewards/margins": 3.732118606567383, "rewards/rejected": 2.358341693878174, "step": 2299 }, { "epoch": 0.51, "learning_rate": 8.73596372304901e-06, "logits/chosen": -1.8479089736938477, "logits/rejected": -1.8598644733428955, "logps/chosen": -54.130409240722656, "logps/rejected": -76.25152587890625, "loss": 1.6495, "rewards/accuracies": 0.0, "rewards/chosen": 3.70609974861145, "rewards/margins": -2.508951425552368, "rewards/rejected": 6.215051174163818, "step": 2300 }, { "epoch": 0.51, "learning_rate": 8.734772288531604e-06, "logits/chosen": -1.2875515222549438, "logits/rejected": -1.3870009183883667, "logps/chosen": -13.149106979370117, "logps/rejected": -92.3422622680664, "loss": 3.5042, "rewards/accuracies": 0.0, "rewards/chosen": 1.4634422063827515, "rewards/margins": -4.969915866851807, "rewards/rejected": 6.433358192443848, "step": 2301 }, { "epoch": 0.51, "learning_rate": 8.733580374104936e-06, "logits/chosen": -1.571854591369629, "logits/rejected": -1.5179489850997925, "logps/chosen": -67.68648529052734, "logps/rejected": -47.87791442871094, "loss": 1.2511, "rewards/accuracies": 0.0, "rewards/chosen": 2.334714651107788, "rewards/margins": -1.2439489364624023, "rewards/rejected": 3.5786635875701904, "step": 2302 }, { "epoch": 0.51, "learning_rate": 8.732387979922167e-06, "logits/chosen": -1.8342264890670776, "logits/rejected": -1.7838730812072754, "logps/chosen": -94.96025848388672, "logps/rejected": -41.66356658935547, "loss": 0.5234, "rewards/accuracies": 0.0, "rewards/chosen": 3.735060930252075, "rewards/margins": -0.22048258781433105, "rewards/rejected": 3.9555435180664062, "step": 2303 }, { "epoch": 0.51, "learning_rate": 8.731195106136515e-06, "logits/chosen": -1.667670488357544, "logits/rejected": -1.6047908067703247, "logps/chosen": -59.93220520019531, "logps/rejected": -66.74800109863281, "loss": 0.6922, "rewards/accuracies": 0.0, "rewards/chosen": 3.1759591102600098, "rewards/margins": -1.0709218978881836, "rewards/rejected": 4.246881008148193, "step": 2304 }, { "epoch": 0.51, "learning_rate": 8.730001752901258e-06, "logits/chosen": -1.236458659172058, "logits/rejected": -1.2464478015899658, "logps/chosen": -167.00250244140625, "logps/rejected": -182.0145263671875, "loss": 1.2889, "rewards/accuracies": 0.0, "rewards/chosen": 6.277951240539551, "rewards/margins": -2.024993896484375, "rewards/rejected": 8.302945137023926, "step": 2305 }, { "epoch": 0.51, "learning_rate": 8.728807920369747e-06, "logits/chosen": -1.4953542947769165, "logits/rejected": -1.3302851915359497, "logps/chosen": -90.66667175292969, "logps/rejected": -64.55204010009766, "loss": 0.4701, "rewards/accuracies": 1.0, "rewards/chosen": 5.7143449783325195, "rewards/margins": 2.086422920227051, "rewards/rejected": 3.6279220581054688, "step": 2306 }, { "epoch": 0.51, "learning_rate": 8.727613608695379e-06, "logits/chosen": -1.7974895238876343, "logits/rejected": -1.7079285383224487, "logps/chosen": -64.84521484375, "logps/rejected": -34.57633972167969, "loss": 0.1424, "rewards/accuracies": 1.0, "rewards/chosen": 3.1794564723968506, "rewards/margins": 1.7189453840255737, "rewards/rejected": 1.4605110883712769, "step": 2307 }, { "epoch": 0.51, "learning_rate": 8.726418818031623e-06, "logits/chosen": -1.4465935230255127, "logits/rejected": -1.4465935230255127, "logps/chosen": -48.132179260253906, "logps/rejected": -48.132179260253906, "loss": 0.3546, "rewards/accuracies": 0.0, "rewards/chosen": 3.4176254272460938, "rewards/margins": 0.0, "rewards/rejected": 3.4176254272460938, "step": 2308 }, { "epoch": 0.51, "learning_rate": 8.72522354853201e-06, "logits/chosen": -1.4546836614608765, "logits/rejected": -1.467728853225708, "logps/chosen": -47.39521789550781, "logps/rejected": -85.61112213134766, "loss": 0.3684, "rewards/accuracies": 1.0, "rewards/chosen": 4.348237037658691, "rewards/margins": 0.005964756011962891, "rewards/rejected": 4.3422722816467285, "step": 2309 }, { "epoch": 0.51, "learning_rate": 8.724027800350123e-06, "logits/chosen": -1.4798203706741333, "logits/rejected": -1.433600902557373, "logps/chosen": -61.88463592529297, "logps/rejected": -68.49664306640625, "loss": 0.4182, "rewards/accuracies": 0.0, "rewards/chosen": 1.8734062910079956, "rewards/margins": -0.2569533586502075, "rewards/rejected": 2.130359649658203, "step": 2310 }, { "epoch": 0.51, "learning_rate": 8.722831573639618e-06, "logits/chosen": -1.6971713304519653, "logits/rejected": -1.6313705444335938, "logps/chosen": -109.52183532714844, "logps/rejected": -67.19718170166016, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 6.072441101074219, "rewards/margins": 1.245866298675537, "rewards/rejected": 4.826574802398682, "step": 2311 }, { "epoch": 0.51, "learning_rate": 8.721634868554204e-06, "logits/chosen": -1.4087631702423096, "logits/rejected": -1.402733564376831, "logps/chosen": -116.22824096679688, "logps/rejected": -82.06945037841797, "loss": 2.1294, "rewards/accuracies": 1.0, "rewards/chosen": 6.230618476867676, "rewards/margins": 1.5545587539672852, "rewards/rejected": 4.676059722900391, "step": 2312 }, { "epoch": 0.51, "learning_rate": 8.720437685247657e-06, "logits/chosen": -1.808691382408142, "logits/rejected": -1.8068668842315674, "logps/chosen": -125.26847839355469, "logps/rejected": -104.2667236328125, "loss": 1.4478, "rewards/accuracies": 0.0, "rewards/chosen": 5.097978115081787, "rewards/margins": -2.7834243774414062, "rewards/rejected": 7.881402492523193, "step": 2313 }, { "epoch": 0.51, "learning_rate": 8.719240023873809e-06, "logits/chosen": -1.349473476409912, "logits/rejected": -1.1712945699691772, "logps/chosen": -107.775634765625, "logps/rejected": -37.34124755859375, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 5.586678981781006, "rewards/margins": 2.904797315597534, "rewards/rejected": 2.6818816661834717, "step": 2314 }, { "epoch": 0.51, "learning_rate": 8.71804188458656e-06, "logits/chosen": -1.8386948108673096, "logits/rejected": -1.8291741609573364, "logps/chosen": -130.83152770996094, "logps/rejected": -71.00218200683594, "loss": 1.2816, "rewards/accuracies": 0.0, "rewards/chosen": 5.2987380027771, "rewards/margins": -2.1670684814453125, "rewards/rejected": 7.465806484222412, "step": 2315 }, { "epoch": 0.51, "learning_rate": 8.716843267539868e-06, "logits/chosen": -1.5882855653762817, "logits/rejected": -1.6098824739456177, "logps/chosen": -34.46078109741211, "logps/rejected": -63.813228607177734, "loss": 1.9113, "rewards/accuracies": 1.0, "rewards/chosen": 2.4195759296417236, "rewards/margins": 0.05467534065246582, "rewards/rejected": 2.364900588989258, "step": 2316 }, { "epoch": 0.51, "learning_rate": 8.715644172887751e-06, "logits/chosen": -1.460593819618225, "logits/rejected": -1.3796430826187134, "logps/chosen": -139.0686492919922, "logps/rejected": -72.0142822265625, "loss": 0.5246, "rewards/accuracies": 1.0, "rewards/chosen": 6.637510776519775, "rewards/margins": 2.8962478637695312, "rewards/rejected": 3.741262912750244, "step": 2317 }, { "epoch": 0.51, "learning_rate": 8.714444600784289e-06, "logits/chosen": -1.754392147064209, "logits/rejected": -1.7652426958084106, "logps/chosen": -84.54705810546875, "logps/rejected": -75.75836944580078, "loss": 1.9883, "rewards/accuracies": 0.0, "rewards/chosen": 3.422072649002075, "rewards/margins": -3.7425291538238525, "rewards/rejected": 7.164601802825928, "step": 2318 }, { "epoch": 0.51, "learning_rate": 8.713244551383626e-06, "logits/chosen": -1.8636384010314941, "logits/rejected": -1.8301494121551514, "logps/chosen": -90.1414566040039, "logps/rejected": -35.261619567871094, "loss": 1.2777, "rewards/accuracies": 1.0, "rewards/chosen": 5.87017297744751, "rewards/margins": 5.216379642486572, "rewards/rejected": 0.6537933349609375, "step": 2319 }, { "epoch": 0.51, "learning_rate": 8.712044024839962e-06, "logits/chosen": -1.375321388244629, "logits/rejected": -1.2527759075164795, "logps/chosen": -102.36067199707031, "logps/rejected": -45.55009460449219, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": 5.297216892242432, "rewards/margins": 1.7583587169647217, "rewards/rejected": 3.53885817527771, "step": 2320 }, { "epoch": 0.51, "learning_rate": 8.710843021307567e-06, "logits/chosen": -1.745518445968628, "logits/rejected": -1.7350300550460815, "logps/chosen": -100.66897583007812, "logps/rejected": -150.86380004882812, "loss": 1.4281, "rewards/accuracies": 1.0, "rewards/chosen": 5.625526428222656, "rewards/margins": 2.6489455699920654, "rewards/rejected": 2.976580858230591, "step": 2321 }, { "epoch": 0.51, "learning_rate": 8.709641540940764e-06, "logits/chosen": -1.527255654335022, "logits/rejected": -1.4295920133590698, "logps/chosen": -78.89979553222656, "logps/rejected": -29.58171844482422, "loss": 0.4318, "rewards/accuracies": 1.0, "rewards/chosen": 6.6502885818481445, "rewards/margins": 3.6464288234710693, "rewards/rejected": 3.003859758377075, "step": 2322 }, { "epoch": 0.51, "learning_rate": 8.70843958389394e-06, "logits/chosen": -1.5468900203704834, "logits/rejected": -1.5815277099609375, "logps/chosen": -71.43499755859375, "logps/rejected": -92.75159454345703, "loss": 3.3908, "rewards/accuracies": 0.0, "rewards/chosen": 1.5576202869415283, "rewards/margins": -6.655269622802734, "rewards/rejected": 8.212889671325684, "step": 2323 }, { "epoch": 0.51, "learning_rate": 8.707237150321544e-06, "logits/chosen": -1.3815652132034302, "logits/rejected": -1.4576196670532227, "logps/chosen": -54.179962158203125, "logps/rejected": -98.96732330322266, "loss": 1.3345, "rewards/accuracies": 0.0, "rewards/chosen": 4.075299263000488, "rewards/margins": -2.582979679107666, "rewards/rejected": 6.658278942108154, "step": 2324 }, { "epoch": 0.51, "learning_rate": 8.706034240378087e-06, "logits/chosen": -1.3391778469085693, "logits/rejected": -1.3143771886825562, "logps/chosen": -28.65252113342285, "logps/rejected": -97.42005157470703, "loss": 0.7478, "rewards/accuracies": 0.0, "rewards/chosen": 2.2912251949310303, "rewards/margins": -0.6704676151275635, "rewards/rejected": 2.9616928100585938, "step": 2325 }, { "epoch": 0.51, "learning_rate": 8.704830854218138e-06, "logits/chosen": -1.5363942384719849, "logits/rejected": -1.4534556865692139, "logps/chosen": -67.18704986572266, "logps/rejected": -20.25249481201172, "loss": 0.4211, "rewards/accuracies": 1.0, "rewards/chosen": 3.552204132080078, "rewards/margins": 0.8635027408599854, "rewards/rejected": 2.6887013912200928, "step": 2326 }, { "epoch": 0.52, "learning_rate": 8.703626991996333e-06, "logits/chosen": -1.351348638534546, "logits/rejected": -1.351348638534546, "logps/chosen": -11.15478515625, "logps/rejected": -11.15478515625, "loss": 0.4979, "rewards/accuracies": 0.0, "rewards/chosen": 2.744957685470581, "rewards/margins": 0.0, "rewards/rejected": 2.744957685470581, "step": 2327 }, { "epoch": 0.52, "learning_rate": 8.70242265386736e-06, "logits/chosen": -1.7156306505203247, "logits/rejected": -1.7156306505203247, "logps/chosen": -98.02436828613281, "logps/rejected": -98.02436828613281, "loss": 0.487, "rewards/accuracies": 0.0, "rewards/chosen": 7.942070007324219, "rewards/margins": 0.0, "rewards/rejected": 7.942070007324219, "step": 2328 }, { "epoch": 0.52, "learning_rate": 8.701217839985978e-06, "logits/chosen": -1.2627888917922974, "logits/rejected": -1.1857390403747559, "logps/chosen": -49.114078521728516, "logps/rejected": -51.35137176513672, "loss": 0.0855, "rewards/accuracies": 1.0, "rewards/chosen": 4.042351245880127, "rewards/margins": 1.8779339790344238, "rewards/rejected": 2.164417266845703, "step": 2329 }, { "epoch": 0.52, "learning_rate": 8.700012550507e-06, "logits/chosen": -1.7018451690673828, "logits/rejected": -1.7312688827514648, "logps/chosen": -65.88487243652344, "logps/rejected": -108.1412353515625, "loss": 1.1055, "rewards/accuracies": 0.0, "rewards/chosen": 3.8670547008514404, "rewards/margins": -1.7218949794769287, "rewards/rejected": 5.588949680328369, "step": 2330 }, { "epoch": 0.52, "learning_rate": 8.698806785585305e-06, "logits/chosen": -1.5966445207595825, "logits/rejected": -1.1686979532241821, "logps/chosen": -83.86209869384766, "logps/rejected": -65.75299835205078, "loss": 0.2572, "rewards/accuracies": 1.0, "rewards/chosen": 6.665119171142578, "rewards/margins": 4.140918731689453, "rewards/rejected": 2.524200439453125, "step": 2331 }, { "epoch": 0.52, "learning_rate": 8.697600545375829e-06, "logits/chosen": -1.5741877555847168, "logits/rejected": -1.5151387453079224, "logps/chosen": -57.58654022216797, "logps/rejected": -12.074238777160645, "loss": 1.3852, "rewards/accuracies": 1.0, "rewards/chosen": 3.831075429916382, "rewards/margins": 3.090593099594116, "rewards/rejected": 0.7404822707176208, "step": 2332 }, { "epoch": 0.52, "learning_rate": 8.696393830033571e-06, "logits/chosen": -1.69606614112854, "logits/rejected": -1.7057409286499023, "logps/chosen": -53.984676361083984, "logps/rejected": -71.81942749023438, "loss": 0.5078, "rewards/accuracies": 0.0, "rewards/chosen": 4.16279935836792, "rewards/margins": -0.26236534118652344, "rewards/rejected": 4.425164699554443, "step": 2333 }, { "epoch": 0.52, "learning_rate": 8.695186639713593e-06, "logits/chosen": -1.4012857675552368, "logits/rejected": -1.418483853340149, "logps/chosen": -155.706298828125, "logps/rejected": -84.22828674316406, "loss": 0.2394, "rewards/accuracies": 1.0, "rewards/chosen": 7.306787014007568, "rewards/margins": 0.5041103363037109, "rewards/rejected": 6.802676677703857, "step": 2334 }, { "epoch": 0.52, "learning_rate": 8.693978974571013e-06, "logits/chosen": -1.2197668552398682, "logits/rejected": -1.2684913873672485, "logps/chosen": -87.23081970214844, "logps/rejected": -59.040340423583984, "loss": 2.0249, "rewards/accuracies": 0.0, "rewards/chosen": 2.6484153270721436, "rewards/margins": -3.775524377822876, "rewards/rejected": 6.4239397048950195, "step": 2335 }, { "epoch": 0.52, "learning_rate": 8.692770834761017e-06, "logits/chosen": -1.660723090171814, "logits/rejected": -1.6233000755310059, "logps/chosen": -47.689170837402344, "logps/rejected": -62.25144958496094, "loss": 0.5402, "rewards/accuracies": 1.0, "rewards/chosen": 2.754077196121216, "rewards/margins": 0.1638197898864746, "rewards/rejected": 2.590257406234741, "step": 2336 }, { "epoch": 0.52, "learning_rate": 8.691562220438845e-06, "logits/chosen": -1.2684024572372437, "logits/rejected": -1.2684024572372437, "logps/chosen": -38.76886749267578, "logps/rejected": -38.76886749267578, "loss": 0.3951, "rewards/accuracies": 0.0, "rewards/chosen": 3.928852081298828, "rewards/margins": 0.0, "rewards/rejected": 3.928852081298828, "step": 2337 }, { "epoch": 0.52, "learning_rate": 8.690353131759802e-06, "logits/chosen": -1.5993777513504028, "logits/rejected": -1.5830605030059814, "logps/chosen": -35.745216369628906, "logps/rejected": -54.71603775024414, "loss": 1.2594, "rewards/accuracies": 1.0, "rewards/chosen": 3.0635406970977783, "rewards/margins": 0.6457462310791016, "rewards/rejected": 2.4177944660186768, "step": 2338 }, { "epoch": 0.52, "learning_rate": 8.689143568879252e-06, "logits/chosen": -1.540840983390808, "logits/rejected": -1.428053855895996, "logps/chosen": -86.23524475097656, "logps/rejected": -57.82844161987305, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 5.9696245193481445, "rewards/margins": 3.831005573272705, "rewards/rejected": 2.1386189460754395, "step": 2339 }, { "epoch": 0.52, "learning_rate": 8.687933531952624e-06, "logits/chosen": -1.5051207542419434, "logits/rejected": -1.0984854698181152, "logps/chosen": -100.42366027832031, "logps/rejected": -60.52918243408203, "loss": 0.7078, "rewards/accuracies": 1.0, "rewards/chosen": 6.241156101226807, "rewards/margins": 0.9879608154296875, "rewards/rejected": 5.253195285797119, "step": 2340 }, { "epoch": 0.52, "learning_rate": 8.686723021135402e-06, "logits/chosen": -1.4948707818984985, "logits/rejected": -1.4124623537063599, "logps/chosen": -48.147647857666016, "logps/rejected": -15.202791213989258, "loss": 2.2593, "rewards/accuracies": 1.0, "rewards/chosen": 2.115905523300171, "rewards/margins": 1.5885764360427856, "rewards/rejected": 0.5273290872573853, "step": 2341 }, { "epoch": 0.52, "learning_rate": 8.685512036583132e-06, "logits/chosen": -1.5306750535964966, "logits/rejected": -1.5931721925735474, "logps/chosen": -44.60099792480469, "logps/rejected": -95.54736328125, "loss": 1.9884, "rewards/accuracies": 0.0, "rewards/chosen": 3.006028890609741, "rewards/margins": -3.2454164028167725, "rewards/rejected": 6.251445293426514, "step": 2342 }, { "epoch": 0.52, "learning_rate": 8.684300578451428e-06, "logits/chosen": -1.548454999923706, "logits/rejected": -1.4602158069610596, "logps/chosen": -96.435546875, "logps/rejected": -40.88550567626953, "loss": 0.8035, "rewards/accuracies": 0.0, "rewards/chosen": 2.4496071338653564, "rewards/margins": -1.3726425170898438, "rewards/rejected": 3.8222496509552, "step": 2343 }, { "epoch": 0.52, "learning_rate": 8.683088646895955e-06, "logits/chosen": -1.575178861618042, "logits/rejected": -1.5145432949066162, "logps/chosen": -54.053714752197266, "logps/rejected": -40.4857063293457, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 3.4169414043426514, "rewards/margins": 1.3821182250976562, "rewards/rejected": 2.034823179244995, "step": 2344 }, { "epoch": 0.52, "learning_rate": 8.681876242072445e-06, "logits/chosen": -1.4120681285858154, "logits/rejected": -1.3765084743499756, "logps/chosen": -130.94229125976562, "logps/rejected": -50.68147277832031, "loss": 0.9411, "rewards/accuracies": 1.0, "rewards/chosen": 7.647439479827881, "rewards/margins": 4.07899284362793, "rewards/rejected": 3.568446397781372, "step": 2345 }, { "epoch": 0.52, "learning_rate": 8.68066336413669e-06, "logits/chosen": -1.5676884651184082, "logits/rejected": -1.4574710130691528, "logps/chosen": -111.77926635742188, "logps/rejected": -142.3472900390625, "loss": 1.4713, "rewards/accuracies": 0.0, "rewards/chosen": 5.174258708953857, "rewards/margins": -2.0108261108398438, "rewards/rejected": 7.185084819793701, "step": 2346 }, { "epoch": 0.52, "learning_rate": 8.67945001324454e-06, "logits/chosen": -1.5380128622055054, "logits/rejected": -1.5144705772399902, "logps/chosen": -45.14173126220703, "logps/rejected": -63.35544967651367, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 4.451793670654297, "rewards/margins": 2.4210894107818604, "rewards/rejected": 2.0307042598724365, "step": 2347 }, { "epoch": 0.52, "learning_rate": 8.678236189551907e-06, "logits/chosen": -1.4937478303909302, "logits/rejected": -1.573673963546753, "logps/chosen": -75.18598937988281, "logps/rejected": -138.3086395263672, "loss": 2.0528, "rewards/accuracies": 0.0, "rewards/chosen": 4.697704315185547, "rewards/margins": -4.01278018951416, "rewards/rejected": 8.710484504699707, "step": 2348 }, { "epoch": 0.52, "learning_rate": 8.677021893214768e-06, "logits/chosen": -1.5356265306472778, "logits/rejected": -1.4116709232330322, "logps/chosen": -126.24676513671875, "logps/rejected": -54.0718994140625, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 6.2140350341796875, "rewards/margins": 2.8424322605133057, "rewards/rejected": 3.371602773666382, "step": 2349 }, { "epoch": 0.52, "learning_rate": 8.675807124389153e-06, "logits/chosen": -1.4015330076217651, "logits/rejected": -1.3773469924926758, "logps/chosen": -80.73257446289062, "logps/rejected": -46.46110153198242, "loss": 0.6746, "rewards/accuracies": 0.0, "rewards/chosen": 2.0009515285491943, "rewards/margins": -0.9985880851745605, "rewards/rejected": 2.999539613723755, "step": 2350 }, { "epoch": 0.52, "learning_rate": 8.67459188323116e-06, "logits/chosen": -1.5998650789260864, "logits/rejected": -1.6590733528137207, "logps/chosen": -142.5517578125, "logps/rejected": -68.75654602050781, "loss": 0.8366, "rewards/accuracies": 0.0, "rewards/chosen": 5.420897006988525, "rewards/margins": -1.456151008605957, "rewards/rejected": 6.877048015594482, "step": 2351 }, { "epoch": 0.52, "learning_rate": 8.673376169896944e-06, "logits/chosen": -1.8284910917282104, "logits/rejected": -1.8251405954360962, "logps/chosen": -136.42086791992188, "logps/rejected": -148.5133514404297, "loss": 2.2784, "rewards/accuracies": 0.0, "rewards/chosen": 6.060595989227295, "rewards/margins": -4.104625225067139, "rewards/rejected": 10.165221214294434, "step": 2352 }, { "epoch": 0.52, "learning_rate": 8.672159984542721e-06, "logits/chosen": -1.4717320203781128, "logits/rejected": -1.3265132904052734, "logps/chosen": -67.25807189941406, "logps/rejected": -7.3867974281311035, "loss": 0.3877, "rewards/accuracies": 1.0, "rewards/chosen": 2.623539686203003, "rewards/margins": 1.7129089832305908, "rewards/rejected": 0.9106306433677673, "step": 2353 }, { "epoch": 0.52, "learning_rate": 8.670943327324767e-06, "logits/chosen": -1.7620577812194824, "logits/rejected": -1.6435550451278687, "logps/chosen": -83.87055206298828, "logps/rejected": -25.456382751464844, "loss": 1.2617, "rewards/accuracies": 1.0, "rewards/chosen": 4.186509132385254, "rewards/margins": 3.4560563564300537, "rewards/rejected": 0.7304527163505554, "step": 2354 }, { "epoch": 0.52, "learning_rate": 8.66972619839942e-06, "logits/chosen": -1.3935596942901611, "logits/rejected": -1.3097519874572754, "logps/chosen": -45.13111114501953, "logps/rejected": -5.941564559936523, "loss": 0.7196, "rewards/accuracies": 1.0, "rewards/chosen": 3.967200517654419, "rewards/margins": 2.7240843772888184, "rewards/rejected": 1.2431162595748901, "step": 2355 }, { "epoch": 0.52, "learning_rate": 8.668508597923077e-06, "logits/chosen": -1.5465811491012573, "logits/rejected": -1.5383057594299316, "logps/chosen": -57.00751495361328, "logps/rejected": -46.533302307128906, "loss": 1.6183, "rewards/accuracies": 0.0, "rewards/chosen": 2.8554940223693848, "rewards/margins": -1.4021391868591309, "rewards/rejected": 4.257633209228516, "step": 2356 }, { "epoch": 0.52, "learning_rate": 8.6672905260522e-06, "logits/chosen": -1.3459731340408325, "logits/rejected": -1.3598476648330688, "logps/chosen": -55.78345489501953, "logps/rejected": -67.00504302978516, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 4.4413886070251465, "rewards/margins": 1.0830564498901367, "rewards/rejected": 3.3583321571350098, "step": 2357 }, { "epoch": 0.52, "learning_rate": 8.666071982943306e-06, "logits/chosen": -1.6576013565063477, "logits/rejected": -1.5120744705200195, "logps/chosen": -107.38093566894531, "logps/rejected": -57.55809020996094, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": 6.597212314605713, "rewards/margins": 2.1090927124023438, "rewards/rejected": 4.488119602203369, "step": 2358 }, { "epoch": 0.52, "learning_rate": 8.664852968752975e-06, "logits/chosen": -1.4174630641937256, "logits/rejected": -1.3842175006866455, "logps/chosen": -53.43628692626953, "logps/rejected": -69.54885864257812, "loss": 0.5208, "rewards/accuracies": 0.0, "rewards/chosen": 2.9413955211639404, "rewards/margins": -0.6050844192504883, "rewards/rejected": 3.5464799404144287, "step": 2359 }, { "epoch": 0.52, "learning_rate": 8.663633483637847e-06, "logits/chosen": -1.3963273763656616, "logits/rejected": -1.3542481660842896, "logps/chosen": -50.860496520996094, "logps/rejected": -51.72372817993164, "loss": 1.8759, "rewards/accuracies": 0.0, "rewards/chosen": 1.6352760791778564, "rewards/margins": -3.052246332168579, "rewards/rejected": 4.6875224113464355, "step": 2360 }, { "epoch": 0.52, "learning_rate": 8.662413527754624e-06, "logits/chosen": -1.7577440738677979, "logits/rejected": -1.6618610620498657, "logps/chosen": -105.43843841552734, "logps/rejected": -106.31448364257812, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 6.657267093658447, "rewards/margins": 3.236762285232544, "rewards/rejected": 3.4205048084259033, "step": 2361 }, { "epoch": 0.52, "learning_rate": 8.661193101260067e-06, "logits/chosen": -1.8149840831756592, "logits/rejected": -1.8173959255218506, "logps/chosen": -5.013628959655762, "logps/rejected": -54.1943359375, "loss": 0.6215, "rewards/accuracies": 0.0, "rewards/chosen": 0.8139618039131165, "rewards/margins": -0.54677814245224, "rewards/rejected": 1.3607399463653564, "step": 2362 }, { "epoch": 0.52, "learning_rate": 8.659972204310998e-06, "logits/chosen": -1.6994036436080933, "logits/rejected": -1.5137436389923096, "logps/chosen": -123.4668960571289, "logps/rejected": -28.515811920166016, "loss": 0.1348, "rewards/accuracies": 1.0, "rewards/chosen": 6.361733436584473, "rewards/margins": 6.085916519165039, "rewards/rejected": 0.27581673860549927, "step": 2363 }, { "epoch": 0.52, "learning_rate": 8.658750837064299e-06, "logits/chosen": -1.4374350309371948, "logits/rejected": -1.407891035079956, "logps/chosen": -49.58856201171875, "logps/rejected": -68.02437591552734, "loss": 0.3241, "rewards/accuracies": 1.0, "rewards/chosen": 3.4995834827423096, "rewards/margins": 0.10415267944335938, "rewards/rejected": 3.39543080329895, "step": 2364 }, { "epoch": 0.52, "learning_rate": 8.657528999676912e-06, "logits/chosen": -1.7859662771224976, "logits/rejected": -1.6785671710968018, "logps/chosen": -145.11203002929688, "logps/rejected": -47.300601959228516, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 8.60391902923584, "rewards/margins": 6.119362831115723, "rewards/rejected": 2.484556198120117, "step": 2365 }, { "epoch": 0.52, "learning_rate": 8.65630669230584e-06, "logits/chosen": -1.4010618925094604, "logits/rejected": -1.398080825805664, "logps/chosen": -97.95999908447266, "logps/rejected": -67.410888671875, "loss": 0.1082, "rewards/accuracies": 1.0, "rewards/chosen": 6.5867438316345215, "rewards/margins": 1.5230259895324707, "rewards/rejected": 5.063717842102051, "step": 2366 }, { "epoch": 0.52, "learning_rate": 8.65508391510815e-06, "logits/chosen": -1.3963483572006226, "logits/rejected": -1.3963483572006226, "logps/chosen": -37.834136962890625, "logps/rejected": -37.834136962890625, "loss": 0.64, "rewards/accuracies": 0.0, "rewards/chosen": 4.198520660400391, "rewards/margins": 0.0, "rewards/rejected": 4.198520660400391, "step": 2367 }, { "epoch": 0.52, "learning_rate": 8.653860668240963e-06, "logits/chosen": -1.8904331922531128, "logits/rejected": -1.9192897081375122, "logps/chosen": -125.9615478515625, "logps/rejected": -108.00804138183594, "loss": 0.7729, "rewards/accuracies": 0.0, "rewards/chosen": 4.159509181976318, "rewards/margins": -1.296863079071045, "rewards/rejected": 5.456372261047363, "step": 2368 }, { "epoch": 0.52, "learning_rate": 8.652636951861463e-06, "logits/chosen": -1.2864768505096436, "logits/rejected": -1.2166626453399658, "logps/chosen": -75.12937927246094, "logps/rejected": -90.8283462524414, "loss": 0.7071, "rewards/accuracies": 0.0, "rewards/chosen": 2.2851624488830566, "rewards/margins": -1.0922737121582031, "rewards/rejected": 3.3774361610412598, "step": 2369 }, { "epoch": 0.52, "learning_rate": 8.651412766126896e-06, "logits/chosen": -1.7256042957305908, "logits/rejected": -1.6967177391052246, "logps/chosen": -72.87992858886719, "logps/rejected": -72.96270751953125, "loss": 0.6442, "rewards/accuracies": 0.0, "rewards/chosen": 3.120753526687622, "rewards/margins": -0.5768036842346191, "rewards/rejected": 3.697557210922241, "step": 2370 }, { "epoch": 0.52, "learning_rate": 8.650188111194565e-06, "logits/chosen": -1.4450232982635498, "logits/rejected": -1.2960015535354614, "logps/chosen": -93.703125, "logps/rejected": -79.17332458496094, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 5.489798069000244, "rewards/margins": 3.1933236122131348, "rewards/rejected": 2.2964744567871094, "step": 2371 }, { "epoch": 0.53, "learning_rate": 8.648962987221837e-06, "logits/chosen": -1.55059015750885, "logits/rejected": -1.517848014831543, "logps/chosen": -65.22163391113281, "logps/rejected": -76.97235107421875, "loss": 0.8584, "rewards/accuracies": 1.0, "rewards/chosen": 4.370116710662842, "rewards/margins": 1.142643928527832, "rewards/rejected": 3.2274727821350098, "step": 2372 }, { "epoch": 0.53, "learning_rate": 8.647737394366138e-06, "logits/chosen": -1.7173856496810913, "logits/rejected": -1.6620290279388428, "logps/chosen": -52.36786651611328, "logps/rejected": -73.9897232055664, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 2.0010926723480225, "rewards/margins": 1.917905569076538, "rewards/rejected": 0.08318710327148438, "step": 2373 }, { "epoch": 0.53, "learning_rate": 8.646511332784953e-06, "logits/chosen": -1.638495922088623, "logits/rejected": -1.5762906074523926, "logps/chosen": -127.35142517089844, "logps/rejected": -71.06430053710938, "loss": 0.2803, "rewards/accuracies": 1.0, "rewards/chosen": 4.283506870269775, "rewards/margins": 0.7641556262969971, "rewards/rejected": 3.5193512439727783, "step": 2374 }, { "epoch": 0.53, "learning_rate": 8.645284802635827e-06, "logits/chosen": -1.5875523090362549, "logits/rejected": -1.4136958122253418, "logps/chosen": -80.32158660888672, "logps/rejected": -10.459739685058594, "loss": 0.1907, "rewards/accuracies": 1.0, "rewards/chosen": 4.204103946685791, "rewards/margins": 2.6470184326171875, "rewards/rejected": 1.557085633277893, "step": 2375 }, { "epoch": 0.53, "learning_rate": 8.644057804076367e-06, "logits/chosen": -1.446494460105896, "logits/rejected": -1.310029149055481, "logps/chosen": -56.68132781982422, "logps/rejected": -22.94100570678711, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 2.7035233974456787, "rewards/margins": 2.7762513160705566, "rewards/rejected": -0.07272797077894211, "step": 2376 }, { "epoch": 0.53, "learning_rate": 8.642830337264239e-06, "logits/chosen": -1.2475500106811523, "logits/rejected": -1.177597165107727, "logps/chosen": -31.774803161621094, "logps/rejected": -10.625425338745117, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 3.535598039627075, "rewards/margins": 2.6006953716278076, "rewards/rejected": 0.9349026083946228, "step": 2377 }, { "epoch": 0.53, "learning_rate": 8.641602402357168e-06, "logits/chosen": -1.7998284101486206, "logits/rejected": -1.7998284101486206, "logps/chosen": -53.66958236694336, "logps/rejected": -53.66958236694336, "loss": 0.9548, "rewards/accuracies": 0.0, "rewards/chosen": 1.9753795862197876, "rewards/margins": 0.0, "rewards/rejected": 1.9753795862197876, "step": 2378 }, { "epoch": 0.53, "learning_rate": 8.640373999512946e-06, "logits/chosen": -1.4012305736541748, "logits/rejected": -1.4552429914474487, "logps/chosen": -76.89234924316406, "logps/rejected": -79.66197204589844, "loss": 0.685, "rewards/accuracies": 0.0, "rewards/chosen": 2.8651719093322754, "rewards/margins": -0.779303789138794, "rewards/rejected": 3.6444756984710693, "step": 2379 }, { "epoch": 0.53, "learning_rate": 8.639145128889415e-06, "logits/chosen": -1.2471246719360352, "logits/rejected": -1.2471246719360352, "logps/chosen": -16.349300384521484, "logps/rejected": -16.349300384521484, "loss": 1.8919, "rewards/accuracies": 0.0, "rewards/chosen": 1.4082984924316406, "rewards/margins": 0.0, "rewards/rejected": 1.4082984924316406, "step": 2380 }, { "epoch": 0.53, "learning_rate": 8.637915790644482e-06, "logits/chosen": -1.4368056058883667, "logits/rejected": -1.4455159902572632, "logps/chosen": -55.09333801269531, "logps/rejected": -68.15264129638672, "loss": 0.4524, "rewards/accuracies": 0.0, "rewards/chosen": 2.293170213699341, "rewards/margins": -0.3463249206542969, "rewards/rejected": 2.6394951343536377, "step": 2381 }, { "epoch": 0.53, "learning_rate": 8.636685984936115e-06, "logits/chosen": -1.3596080541610718, "logits/rejected": -1.4567044973373413, "logps/chosen": -105.28363037109375, "logps/rejected": -111.81015014648438, "loss": 1.4728, "rewards/accuracies": 0.0, "rewards/chosen": 5.240730285644531, "rewards/margins": -2.7859363555908203, "rewards/rejected": 8.026666641235352, "step": 2382 }, { "epoch": 0.53, "learning_rate": 8.635455711922343e-06, "logits/chosen": -1.713517665863037, "logits/rejected": -1.7003283500671387, "logps/chosen": -34.518760681152344, "logps/rejected": -40.45952606201172, "loss": 0.2918, "rewards/accuracies": 1.0, "rewards/chosen": 2.9098517894744873, "rewards/margins": 0.23340296745300293, "rewards/rejected": 2.6764488220214844, "step": 2383 }, { "epoch": 0.53, "learning_rate": 8.634224971761251e-06, "logits/chosen": -1.6022517681121826, "logits/rejected": -1.6096652746200562, "logps/chosen": -48.68279266357422, "logps/rejected": -55.909828186035156, "loss": 1.2858, "rewards/accuracies": 0.0, "rewards/chosen": 1.428765892982483, "rewards/margins": -1.465659260749817, "rewards/rejected": 2.8944251537323, "step": 2384 }, { "epoch": 0.53, "learning_rate": 8.632993764610986e-06, "logits/chosen": -1.6008939743041992, "logits/rejected": -1.6679456233978271, "logps/chosen": -57.56376266479492, "logps/rejected": -96.11127471923828, "loss": 3.9828, "rewards/accuracies": 0.0, "rewards/chosen": 2.738130569458008, "rewards/margins": -3.8366599082946777, "rewards/rejected": 6.5747904777526855, "step": 2385 }, { "epoch": 0.53, "learning_rate": 8.631762090629756e-06, "logits/chosen": -1.5282551050186157, "logits/rejected": -1.5282551050186157, "logps/chosen": -69.1880111694336, "logps/rejected": -69.1880111694336, "loss": 0.3472, "rewards/accuracies": 0.0, "rewards/chosen": 3.630835771560669, "rewards/margins": 0.0, "rewards/rejected": 3.630835771560669, "step": 2386 }, { "epoch": 0.53, "learning_rate": 8.630529949975828e-06, "logits/chosen": -1.9717503786087036, "logits/rejected": -1.963651180267334, "logps/chosen": -117.3368148803711, "logps/rejected": -101.27584838867188, "loss": 0.6426, "rewards/accuracies": 1.0, "rewards/chosen": 5.044071197509766, "rewards/margins": 3.5976035594940186, "rewards/rejected": 1.446467638015747, "step": 2387 }, { "epoch": 0.53, "learning_rate": 8.629297342807528e-06, "logits/chosen": -1.2746806144714355, "logits/rejected": -1.242835521697998, "logps/chosen": -94.51663208007812, "logps/rejected": -62.38444900512695, "loss": 1.7263, "rewards/accuracies": 0.0, "rewards/chosen": 3.893293857574463, "rewards/margins": -2.174123764038086, "rewards/rejected": 6.067417621612549, "step": 2388 }, { "epoch": 0.53, "learning_rate": 8.628064269283246e-06, "logits/chosen": -1.3184256553649902, "logits/rejected": -1.262660026550293, "logps/chosen": -111.86018371582031, "logps/rejected": -71.80471801757812, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": 4.995663642883301, "rewards/margins": 1.844083547592163, "rewards/rejected": 3.1515800952911377, "step": 2389 }, { "epoch": 0.53, "learning_rate": 8.626830729561426e-06, "logits/chosen": -1.5845625400543213, "logits/rejected": -1.4298499822616577, "logps/chosen": -87.22769165039062, "logps/rejected": -25.640310287475586, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 4.039527893066406, "rewards/margins": 4.537283420562744, "rewards/rejected": -0.49775561690330505, "step": 2390 }, { "epoch": 0.53, "learning_rate": 8.625596723800575e-06, "logits/chosen": -1.4435158967971802, "logits/rejected": -1.4527863264083862, "logps/chosen": -52.96039581298828, "logps/rejected": -87.86798858642578, "loss": 0.7495, "rewards/accuracies": 0.0, "rewards/chosen": 2.5803658962249756, "rewards/margins": -1.002166748046875, "rewards/rejected": 3.5825326442718506, "step": 2391 }, { "epoch": 0.53, "learning_rate": 8.624362252159262e-06, "logits/chosen": -1.5762879848480225, "logits/rejected": -1.5066038370132446, "logps/chosen": -56.436973571777344, "logps/rejected": -17.692989349365234, "loss": 0.4498, "rewards/accuracies": 1.0, "rewards/chosen": 3.0974221229553223, "rewards/margins": 2.514171600341797, "rewards/rejected": 0.5832504630088806, "step": 2392 }, { "epoch": 0.53, "learning_rate": 8.623127314796111e-06, "logits/chosen": -1.5068901777267456, "logits/rejected": -1.491182565689087, "logps/chosen": -57.62202453613281, "logps/rejected": -49.637351989746094, "loss": 0.3753, "rewards/accuracies": 1.0, "rewards/chosen": 4.285459995269775, "rewards/margins": 0.3314995765686035, "rewards/rejected": 3.953960418701172, "step": 2393 }, { "epoch": 0.53, "learning_rate": 8.621891911869811e-06, "logits/chosen": -1.4219262599945068, "logits/rejected": -1.512502670288086, "logps/chosen": -64.99968719482422, "logps/rejected": -86.0142822265625, "loss": 0.8993, "rewards/accuracies": 0.0, "rewards/chosen": 6.025310516357422, "rewards/margins": -1.0975608825683594, "rewards/rejected": 7.122871398925781, "step": 2394 }, { "epoch": 0.53, "learning_rate": 8.620656043539106e-06, "logits/chosen": -1.296337604522705, "logits/rejected": -1.2699657678604126, "logps/chosen": -52.844757080078125, "logps/rejected": -54.711734771728516, "loss": 0.4256, "rewards/accuracies": 0.0, "rewards/chosen": 3.3651626110076904, "rewards/margins": -0.1838071346282959, "rewards/rejected": 3.5489697456359863, "step": 2395 }, { "epoch": 0.53, "learning_rate": 8.619419709962804e-06, "logits/chosen": -1.306537389755249, "logits/rejected": -1.1505430936813354, "logps/chosen": -103.95316314697266, "logps/rejected": -24.604284286499023, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 7.902828216552734, "rewards/margins": 6.988964557647705, "rewards/rejected": 0.9138635993003845, "step": 2396 }, { "epoch": 0.53, "learning_rate": 8.61818291129977e-06, "logits/chosen": -1.2726774215698242, "logits/rejected": -1.3231106996536255, "logps/chosen": -45.93638610839844, "logps/rejected": -107.70751953125, "loss": 0.875, "rewards/accuracies": 0.0, "rewards/chosen": 4.195435523986816, "rewards/margins": -0.756319522857666, "rewards/rejected": 4.951755046844482, "step": 2397 }, { "epoch": 0.53, "learning_rate": 8.61694564770893e-06, "logits/chosen": -1.1183372735977173, "logits/rejected": -1.0892308950424194, "logps/chosen": -77.343505859375, "logps/rejected": -69.2122573852539, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 3.396493673324585, "rewards/margins": 1.0172951221466064, "rewards/rejected": 2.3791985511779785, "step": 2398 }, { "epoch": 0.53, "learning_rate": 8.61570791934927e-06, "logits/chosen": -1.6510818004608154, "logits/rejected": -1.6350661516189575, "logps/chosen": -61.49509048461914, "logps/rejected": -69.56169128417969, "loss": 2.2343, "rewards/accuracies": 0.0, "rewards/chosen": 2.803407669067383, "rewards/margins": -1.4589838981628418, "rewards/rejected": 4.262391567230225, "step": 2399 }, { "epoch": 0.53, "learning_rate": 8.614469726379833e-06, "logits/chosen": -1.6651926040649414, "logits/rejected": -1.6651926040649414, "logps/chosen": -26.776763916015625, "logps/rejected": -26.776763916015625, "loss": 0.614, "rewards/accuracies": 0.0, "rewards/chosen": 3.014261245727539, "rewards/margins": 0.0, "rewards/rejected": 3.014261245727539, "step": 2400 }, { "epoch": 0.53, "learning_rate": 8.613231068959726e-06, "logits/chosen": -1.3882758617401123, "logits/rejected": -1.3142695426940918, "logps/chosen": -40.77726364135742, "logps/rejected": -24.350902557373047, "loss": 6.104, "rewards/accuracies": 1.0, "rewards/chosen": 0.8041488528251648, "rewards/margins": 0.2845268249511719, "rewards/rejected": 0.5196220278739929, "step": 2401 }, { "epoch": 0.53, "learning_rate": 8.61199194724811e-06, "logits/chosen": -1.752966284751892, "logits/rejected": -1.7100306749343872, "logps/chosen": -138.69686889648438, "logps/rejected": -52.125038146972656, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 8.433578491210938, "rewards/margins": 6.231693267822266, "rewards/rejected": 2.201885223388672, "step": 2402 }, { "epoch": 0.53, "learning_rate": 8.610752361404216e-06, "logits/chosen": -1.5310826301574707, "logits/rejected": -1.4948574304580688, "logps/chosen": -48.68324279785156, "logps/rejected": -52.37355041503906, "loss": 0.511, "rewards/accuracies": 0.0, "rewards/chosen": 1.8404792547225952, "rewards/margins": -0.038420915603637695, "rewards/rejected": 1.878900170326233, "step": 2403 }, { "epoch": 0.53, "learning_rate": 8.60951231158732e-06, "logits/chosen": -1.3766217231750488, "logits/rejected": -1.3313038349151611, "logps/chosen": -42.14806365966797, "logps/rejected": -82.19316864013672, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": 2.55985426902771, "rewards/margins": 2.450834035873413, "rewards/rejected": 0.10902023315429688, "step": 2404 }, { "epoch": 0.53, "learning_rate": 8.60827179795677e-06, "logits/chosen": -1.5857057571411133, "logits/rejected": -1.5107866525650024, "logps/chosen": -173.06173706054688, "logps/rejected": -84.59677124023438, "loss": 0.9886, "rewards/accuracies": 0.0, "rewards/chosen": 4.909222602844238, "rewards/margins": -1.1850309371948242, "rewards/rejected": 6.0942535400390625, "step": 2405 }, { "epoch": 0.53, "learning_rate": 8.607030820671969e-06, "logits/chosen": -1.4075353145599365, "logits/rejected": -1.4350439310073853, "logps/chosen": -63.398223876953125, "logps/rejected": -115.65678405761719, "loss": 3.5904, "rewards/accuracies": 0.0, "rewards/chosen": 2.51450514793396, "rewards/margins": -4.8347625732421875, "rewards/rejected": 7.349267482757568, "step": 2406 }, { "epoch": 0.53, "learning_rate": 8.605789379892378e-06, "logits/chosen": -1.5540765523910522, "logits/rejected": -1.5327223539352417, "logps/chosen": -45.667572021484375, "logps/rejected": -54.203086853027344, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 5.193789005279541, "rewards/margins": 2.7801926136016846, "rewards/rejected": 2.4135963916778564, "step": 2407 }, { "epoch": 0.53, "learning_rate": 8.60454747577752e-06, "logits/chosen": -1.604567527770996, "logits/rejected": -1.6314165592193604, "logps/chosen": -63.408203125, "logps/rejected": -92.28396606445312, "loss": 1.0017, "rewards/accuracies": 0.0, "rewards/chosen": 2.7636101245880127, "rewards/margins": -1.3793833255767822, "rewards/rejected": 4.142993450164795, "step": 2408 }, { "epoch": 0.53, "learning_rate": 8.603305108486975e-06, "logits/chosen": -1.332467794418335, "logits/rejected": -1.3308510780334473, "logps/chosen": -65.3120346069336, "logps/rejected": -62.86315155029297, "loss": 0.9004, "rewards/accuracies": 0.0, "rewards/chosen": 2.209829092025757, "rewards/margins": -1.3907866477966309, "rewards/rejected": 3.6006157398223877, "step": 2409 }, { "epoch": 0.53, "learning_rate": 8.602062278180388e-06, "logits/chosen": -1.517973780632019, "logits/rejected": -1.5211365222930908, "logps/chosen": -59.754798889160156, "logps/rejected": -58.0556640625, "loss": 1.3479, "rewards/accuracies": 0.0, "rewards/chosen": 3.03437876701355, "rewards/margins": -1.5860755443572998, "rewards/rejected": 4.62045431137085, "step": 2410 }, { "epoch": 0.53, "learning_rate": 8.600818985017457e-06, "logits/chosen": -1.729088306427002, "logits/rejected": -1.5918571949005127, "logps/chosen": -104.9358901977539, "logps/rejected": -16.409595489501953, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 4.948409557342529, "rewards/margins": 3.4292564392089844, "rewards/rejected": 1.5191532373428345, "step": 2411 }, { "epoch": 0.53, "learning_rate": 8.59957522915794e-06, "logits/chosen": -1.217139482498169, "logits/rejected": -1.052175521850586, "logps/chosen": -41.435115814208984, "logps/rejected": -11.512960433959961, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 3.8720600605010986, "rewards/margins": 3.007110834121704, "rewards/rejected": 0.8649492263793945, "step": 2412 }, { "epoch": 0.53, "learning_rate": 8.598331010761662e-06, "logits/chosen": -1.5024604797363281, "logits/rejected": -1.5065256357192993, "logps/chosen": -51.10346984863281, "logps/rejected": -72.87185668945312, "loss": 0.218, "rewards/accuracies": 1.0, "rewards/chosen": 3.482837677001953, "rewards/margins": 0.7663962841033936, "rewards/rejected": 2.7164413928985596, "step": 2413 }, { "epoch": 0.53, "learning_rate": 8.597086329988498e-06, "logits/chosen": -1.4226104021072388, "logits/rejected": -1.4255752563476562, "logps/chosen": -73.33686828613281, "logps/rejected": -74.44795989990234, "loss": 0.9643, "rewards/accuracies": 0.0, "rewards/chosen": 2.71820068359375, "rewards/margins": -1.6298470497131348, "rewards/rejected": 4.348047733306885, "step": 2414 }, { "epoch": 0.53, "learning_rate": 8.595841186998388e-06, "logits/chosen": -1.6044930219650269, "logits/rejected": -1.5691545009613037, "logps/chosen": -50.101444244384766, "logps/rejected": -55.12227249145508, "loss": 1.1314, "rewards/accuracies": 0.0, "rewards/chosen": 2.6839962005615234, "rewards/margins": -2.100677013397217, "rewards/rejected": 4.78467321395874, "step": 2415 }, { "epoch": 0.53, "learning_rate": 8.594595581951329e-06, "logits/chosen": -1.7976551055908203, "logits/rejected": -1.7976551055908203, "logps/chosen": -103.68852233886719, "logps/rejected": -103.68852233886719, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 3.964643955230713, "rewards/margins": 0.0, "rewards/rejected": 3.964643955230713, "step": 2416 }, { "epoch": 0.53, "learning_rate": 8.593349515007379e-06, "logits/chosen": -1.3774336576461792, "logits/rejected": -1.3774336576461792, "logps/chosen": -56.22765350341797, "logps/rejected": -56.22765350341797, "loss": 0.3548, "rewards/accuracies": 0.0, "rewards/chosen": 5.249275207519531, "rewards/margins": 0.0, "rewards/rejected": 5.249275207519531, "step": 2417 }, { "epoch": 0.54, "learning_rate": 8.592102986326656e-06, "logits/chosen": -1.6368452310562134, "logits/rejected": -1.605434775352478, "logps/chosen": -116.14938354492188, "logps/rejected": -115.38165283203125, "loss": 1.1282, "rewards/accuracies": 0.0, "rewards/chosen": 6.853170871734619, "rewards/margins": -1.029252529144287, "rewards/rejected": 7.882423400878906, "step": 2418 }, { "epoch": 0.54, "learning_rate": 8.590855996069334e-06, "logits/chosen": -1.8489227294921875, "logits/rejected": -1.8663564920425415, "logps/chosen": -46.95259094238281, "logps/rejected": -71.44319152832031, "loss": 2.098, "rewards/accuracies": 0.0, "rewards/chosen": 2.2406463623046875, "rewards/margins": -4.096975803375244, "rewards/rejected": 6.337622165679932, "step": 2419 }, { "epoch": 0.54, "learning_rate": 8.589608544395646e-06, "logits/chosen": -2.3569326400756836, "logits/rejected": -2.3952322006225586, "logps/chosen": -145.89364624023438, "logps/rejected": -133.228515625, "loss": 0.9741, "rewards/accuracies": 0.0, "rewards/chosen": 6.69807767868042, "rewards/margins": -1.7899470329284668, "rewards/rejected": 8.488024711608887, "step": 2420 }, { "epoch": 0.54, "learning_rate": 8.588360631465893e-06, "logits/chosen": -1.5887700319290161, "logits/rejected": -1.5887700319290161, "logps/chosen": -31.490224838256836, "logps/rejected": -31.490224838256836, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.9885666370391846, "rewards/margins": 0.0, "rewards/rejected": 3.9885666370391846, "step": 2421 }, { "epoch": 0.54, "learning_rate": 8.587112257440422e-06, "logits/chosen": -1.5282423496246338, "logits/rejected": -1.4955472946166992, "logps/chosen": -59.810298919677734, "logps/rejected": -48.133033752441406, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 3.6878247261047363, "rewards/margins": 0.23577165603637695, "rewards/rejected": 3.4520530700683594, "step": 2422 }, { "epoch": 0.54, "learning_rate": 8.585863422479652e-06, "logits/chosen": -1.5994573831558228, "logits/rejected": -1.2868719100952148, "logps/chosen": -125.28823852539062, "logps/rejected": -34.135406494140625, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 5.3915863037109375, "rewards/margins": 5.229460716247559, "rewards/rejected": 0.16212578117847443, "step": 2423 }, { "epoch": 0.54, "learning_rate": 8.584614126744051e-06, "logits/chosen": -1.6283378601074219, "logits/rejected": -1.5800933837890625, "logps/chosen": -51.444061279296875, "logps/rejected": -21.005142211914062, "loss": 0.7135, "rewards/accuracies": 1.0, "rewards/chosen": 4.002488613128662, "rewards/margins": 1.851886510848999, "rewards/rejected": 2.150602102279663, "step": 2424 }, { "epoch": 0.54, "learning_rate": 8.583364370394152e-06, "logits/chosen": -1.8373539447784424, "logits/rejected": -1.7777141332626343, "logps/chosen": -62.46669387817383, "logps/rejected": -68.15107727050781, "loss": 0.9063, "rewards/accuracies": 0.0, "rewards/chosen": 2.6665942668914795, "rewards/margins": -0.7859373092651367, "rewards/rejected": 3.452531576156616, "step": 2425 }, { "epoch": 0.54, "learning_rate": 8.582114153590543e-06, "logits/chosen": -1.4643861055374146, "logits/rejected": -1.3335381746292114, "logps/chosen": -97.828125, "logps/rejected": -59.428226470947266, "loss": 0.3088, "rewards/accuracies": 1.0, "rewards/chosen": 6.141720771789551, "rewards/margins": 0.32869672775268555, "rewards/rejected": 5.813024044036865, "step": 2426 }, { "epoch": 0.54, "learning_rate": 8.58086347649388e-06, "logits/chosen": -1.46576726436615, "logits/rejected": -1.3108394145965576, "logps/chosen": -159.15757751464844, "logps/rejected": -130.1563262939453, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 9.268266677856445, "rewards/margins": 3.525278091430664, "rewards/rejected": 5.742988586425781, "step": 2427 }, { "epoch": 0.54, "learning_rate": 8.579612339264867e-06, "logits/chosen": -1.4900330305099487, "logits/rejected": -1.4900330305099487, "logps/chosen": -27.678056716918945, "logps/rejected": -27.678056716918945, "loss": 0.4051, "rewards/accuracies": 0.0, "rewards/chosen": 1.038953185081482, "rewards/margins": 0.0, "rewards/rejected": 1.038953185081482, "step": 2428 }, { "epoch": 0.54, "learning_rate": 8.578360742064274e-06, "logits/chosen": -1.9549509286880493, "logits/rejected": -1.9458242654800415, "logps/chosen": -64.4159927368164, "logps/rejected": -76.9303970336914, "loss": 1.0487, "rewards/accuracies": 0.0, "rewards/chosen": 4.463997840881348, "rewards/margins": -0.5329046249389648, "rewards/rejected": 4.9969024658203125, "step": 2429 }, { "epoch": 0.54, "learning_rate": 8.577108685052927e-06, "logits/chosen": -1.341529130935669, "logits/rejected": -1.3721108436584473, "logps/chosen": -43.49320983886719, "logps/rejected": -32.987789154052734, "loss": 1.485, "rewards/accuracies": 1.0, "rewards/chosen": 1.350568413734436, "rewards/margins": 0.08798563480377197, "rewards/rejected": 1.262582778930664, "step": 2430 }, { "epoch": 0.54, "learning_rate": 8.575856168391714e-06, "logits/chosen": -1.2470409870147705, "logits/rejected": -1.312511682510376, "logps/chosen": -77.45343017578125, "logps/rejected": -109.60427856445312, "loss": 0.9926, "rewards/accuracies": 0.0, "rewards/chosen": 1.8472137451171875, "rewards/margins": -1.7760834693908691, "rewards/rejected": 3.6232972145080566, "step": 2431 }, { "epoch": 0.54, "learning_rate": 8.57460319224158e-06, "logits/chosen": -1.5566153526306152, "logits/rejected": -1.5099537372589111, "logps/chosen": -58.700504302978516, "logps/rejected": -56.735939025878906, "loss": 0.514, "rewards/accuracies": 1.0, "rewards/chosen": 3.91156268119812, "rewards/margins": 0.6326770782470703, "rewards/rejected": 3.27888560295105, "step": 2432 }, { "epoch": 0.54, "learning_rate": 8.573349756763527e-06, "logits/chosen": -1.3633582592010498, "logits/rejected": -1.3633582592010498, "logps/chosen": -58.54766082763672, "logps/rejected": -58.54766082763672, "loss": 1.696, "rewards/accuracies": 0.0, "rewards/chosen": 3.8741097450256348, "rewards/margins": 0.0, "rewards/rejected": 3.8741097450256348, "step": 2433 }, { "epoch": 0.54, "learning_rate": 8.572095862118621e-06, "logits/chosen": -1.5682884454727173, "logits/rejected": -1.5145440101623535, "logps/chosen": -53.42626953125, "logps/rejected": -84.76780700683594, "loss": 0.9988, "rewards/accuracies": 0.0, "rewards/chosen": 3.544081926345825, "rewards/margins": -1.816469430923462, "rewards/rejected": 5.360551357269287, "step": 2434 }, { "epoch": 0.54, "learning_rate": 8.570841508467984e-06, "logits/chosen": -1.756633996963501, "logits/rejected": -1.756633996963501, "logps/chosen": -51.51105880737305, "logps/rejected": -51.51105880737305, "loss": 0.7267, "rewards/accuracies": 0.0, "rewards/chosen": 2.81805682182312, "rewards/margins": 0.0, "rewards/rejected": 2.81805682182312, "step": 2435 }, { "epoch": 0.54, "learning_rate": 8.569586695972798e-06, "logits/chosen": -1.3054249286651611, "logits/rejected": -1.231909155845642, "logps/chosen": -55.367008209228516, "logps/rejected": -62.4107551574707, "loss": 0.5535, "rewards/accuracies": 0.0, "rewards/chosen": 5.2980265617370605, "rewards/margins": -0.43279266357421875, "rewards/rejected": 5.730819225311279, "step": 2436 }, { "epoch": 0.54, "learning_rate": 8.568331424794301e-06, "logits/chosen": -1.2721471786499023, "logits/rejected": -1.2721471786499023, "logps/chosen": -32.57601547241211, "logps/rejected": -32.57601547241211, "loss": 0.6623, "rewards/accuracies": 0.0, "rewards/chosen": 2.517319440841675, "rewards/margins": 0.0, "rewards/rejected": 2.517319440841675, "step": 2437 }, { "epoch": 0.54, "learning_rate": 8.567075695093796e-06, "logits/chosen": -1.6053975820541382, "logits/rejected": -1.6053975820541382, "logps/chosen": -34.24781799316406, "logps/rejected": -34.24781799316406, "loss": 0.5793, "rewards/accuracies": 0.0, "rewards/chosen": 1.4691845178604126, "rewards/margins": 0.0, "rewards/rejected": 1.4691845178604126, "step": 2438 }, { "epoch": 0.54, "learning_rate": 8.565819507032637e-06, "logits/chosen": -1.82721745967865, "logits/rejected": -1.6381092071533203, "logps/chosen": -104.7711181640625, "logps/rejected": -67.15141296386719, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": 7.296853542327881, "rewards/margins": 4.430972099304199, "rewards/rejected": 2.8658814430236816, "step": 2439 }, { "epoch": 0.54, "learning_rate": 8.564562860772246e-06, "logits/chosen": -1.6086673736572266, "logits/rejected": -1.7282438278198242, "logps/chosen": -44.2054443359375, "logps/rejected": -90.36598205566406, "loss": 3.6037, "rewards/accuracies": 0.0, "rewards/chosen": 3.841059923171997, "rewards/margins": -6.1589813232421875, "rewards/rejected": 10.000041007995605, "step": 2440 }, { "epoch": 0.54, "learning_rate": 8.563305756474094e-06, "logits/chosen": -1.6321827173233032, "logits/rejected": -1.472137212753296, "logps/chosen": -57.95945358276367, "logps/rejected": -53.19849395751953, "loss": 0.3843, "rewards/accuracies": 1.0, "rewards/chosen": 5.54233980178833, "rewards/margins": 2.562749147415161, "rewards/rejected": 2.979590654373169, "step": 2441 }, { "epoch": 0.54, "learning_rate": 8.562048194299719e-06, "logits/chosen": -1.383965253829956, "logits/rejected": -1.3934049606323242, "logps/chosen": -89.6439208984375, "logps/rejected": -75.62690734863281, "loss": 0.1721, "rewards/accuracies": 1.0, "rewards/chosen": 3.3227150440216064, "rewards/margins": 0.947573184967041, "rewards/rejected": 2.3751418590545654, "step": 2442 }, { "epoch": 0.54, "learning_rate": 8.560790174410713e-06, "logits/chosen": -1.6070865392684937, "logits/rejected": -1.6070865392684937, "logps/chosen": -33.021697998046875, "logps/rejected": -33.021697998046875, "loss": 0.3917, "rewards/accuracies": 0.0, "rewards/chosen": 1.1619857549667358, "rewards/margins": 0.0, "rewards/rejected": 1.1619857549667358, "step": 2443 }, { "epoch": 0.54, "learning_rate": 8.559531696968733e-06, "logits/chosen": -1.60871422290802, "logits/rejected": -1.5886895656585693, "logps/chosen": -45.065757751464844, "logps/rejected": -59.222251892089844, "loss": 0.4119, "rewards/accuracies": 0.0, "rewards/chosen": 2.6077988147735596, "rewards/margins": -0.19008946418762207, "rewards/rejected": 2.7978882789611816, "step": 2444 }, { "epoch": 0.54, "learning_rate": 8.558272762135483e-06, "logits/chosen": -1.6749112606048584, "logits/rejected": -1.6130717992782593, "logps/chosen": -106.69255065917969, "logps/rejected": -113.68254089355469, "loss": 0.4019, "rewards/accuracies": 0.0, "rewards/chosen": 6.671870708465576, "rewards/margins": -0.20563030242919922, "rewards/rejected": 6.877501010894775, "step": 2445 }, { "epoch": 0.54, "learning_rate": 8.557013370072737e-06, "logits/chosen": -1.4256683588027954, "logits/rejected": -1.468524694442749, "logps/chosen": -60.1423225402832, "logps/rejected": -118.11213684082031, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 5.967221736907959, "rewards/margins": 1.829679012298584, "rewards/rejected": 4.137542724609375, "step": 2446 }, { "epoch": 0.54, "learning_rate": 8.555753520942327e-06, "logits/chosen": -1.6374409198760986, "logits/rejected": -1.3578486442565918, "logps/chosen": -165.55516052246094, "logps/rejected": -58.05943298339844, "loss": 0.4467, "rewards/accuracies": 1.0, "rewards/chosen": 7.139634609222412, "rewards/margins": 3.223161220550537, "rewards/rejected": 3.916473388671875, "step": 2447 }, { "epoch": 0.54, "learning_rate": 8.554493214906135e-06, "logits/chosen": -1.7382642030715942, "logits/rejected": -1.5163873434066772, "logps/chosen": -148.88192749023438, "logps/rejected": -43.10023498535156, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 8.073405265808105, "rewards/margins": 5.708759784698486, "rewards/rejected": 2.364645481109619, "step": 2448 }, { "epoch": 0.54, "learning_rate": 8.55323245212611e-06, "logits/chosen": -1.6147499084472656, "logits/rejected": -1.6022356748580933, "logps/chosen": -104.65087127685547, "logps/rejected": -58.293418884277344, "loss": 0.466, "rewards/accuracies": 0.0, "rewards/chosen": 6.1786675453186035, "rewards/margins": -0.4257516860961914, "rewards/rejected": 6.604419231414795, "step": 2449 }, { "epoch": 0.54, "learning_rate": 8.551971232764255e-06, "logits/chosen": -1.6811244487762451, "logits/rejected": -1.6581156253814697, "logps/chosen": -76.8004150390625, "logps/rejected": -71.54527282714844, "loss": 0.3131, "rewards/accuracies": 1.0, "rewards/chosen": 7.031613349914551, "rewards/margins": 3.848336935043335, "rewards/rejected": 3.183276414871216, "step": 2450 }, { "epoch": 0.54, "learning_rate": 8.550709556982637e-06, "logits/chosen": -1.6011950969696045, "logits/rejected": -1.5803537368774414, "logps/chosen": -25.700559616088867, "logps/rejected": -33.12446594238281, "loss": 1.2144, "rewards/accuracies": 1.0, "rewards/chosen": 2.255795955657959, "rewards/margins": 0.34371626377105713, "rewards/rejected": 1.9120796918869019, "step": 2451 }, { "epoch": 0.54, "learning_rate": 8.549447424943379e-06, "logits/chosen": -1.422526240348816, "logits/rejected": -1.3206787109375, "logps/chosen": -68.51811218261719, "logps/rejected": -52.07566833496094, "loss": 0.0639, "rewards/accuracies": 1.0, "rewards/chosen": 3.362459659576416, "rewards/margins": 2.2624993324279785, "rewards/rejected": 1.0999603271484375, "step": 2452 }, { "epoch": 0.54, "learning_rate": 8.548184836808657e-06, "logits/chosen": -1.3098046779632568, "logits/rejected": -1.3060259819030762, "logps/chosen": -6.877993583679199, "logps/rejected": -23.34059715270996, "loss": 0.6471, "rewards/accuracies": 1.0, "rewards/chosen": 2.103893995285034, "rewards/margins": 0.19360172748565674, "rewards/rejected": 1.9102922677993774, "step": 2453 }, { "epoch": 0.54, "learning_rate": 8.546921792740712e-06, "logits/chosen": -1.524327039718628, "logits/rejected": -1.460715889930725, "logps/chosen": -35.418663024902344, "logps/rejected": -89.34861755371094, "loss": 0.6986, "rewards/accuracies": 1.0, "rewards/chosen": 3.1228928565979004, "rewards/margins": 0.7421844005584717, "rewards/rejected": 2.3807084560394287, "step": 2454 }, { "epoch": 0.54, "learning_rate": 8.545658292901844e-06, "logits/chosen": -1.3679791688919067, "logits/rejected": -1.3172547817230225, "logps/chosen": -50.27425003051758, "logps/rejected": -54.554908752441406, "loss": 0.3436, "rewards/accuracies": 1.0, "rewards/chosen": 3.3573575019836426, "rewards/margins": 0.325913667678833, "rewards/rejected": 3.0314438343048096, "step": 2455 }, { "epoch": 0.54, "learning_rate": 8.544394337454409e-06, "logits/chosen": -1.4782377481460571, "logits/rejected": -1.466589331626892, "logps/chosen": -33.360801696777344, "logps/rejected": -63.937034606933594, "loss": 1.2445, "rewards/accuracies": 0.0, "rewards/chosen": 1.9915584325790405, "rewards/margins": -1.9246593713760376, "rewards/rejected": 3.916217803955078, "step": 2456 }, { "epoch": 0.54, "learning_rate": 8.543129926560822e-06, "logits/chosen": -1.375592827796936, "logits/rejected": -1.396915078163147, "logps/chosen": -47.15008544921875, "logps/rejected": -53.13241958618164, "loss": 0.821, "rewards/accuracies": 0.0, "rewards/chosen": 1.9844391345977783, "rewards/margins": -0.7406444549560547, "rewards/rejected": 2.725083589553833, "step": 2457 }, { "epoch": 0.54, "learning_rate": 8.541865060383559e-06, "logits/chosen": -1.6770341396331787, "logits/rejected": -1.669237732887268, "logps/chosen": -78.25788879394531, "logps/rejected": -78.11784362792969, "loss": 1.1837, "rewards/accuracies": 0.0, "rewards/chosen": 4.366837501525879, "rewards/margins": -2.1615681648254395, "rewards/rejected": 6.528405666351318, "step": 2458 }, { "epoch": 0.54, "learning_rate": 8.540599739085147e-06, "logits/chosen": -1.7349588871002197, "logits/rejected": -1.703506350517273, "logps/chosen": -102.96430969238281, "logps/rejected": -153.62371826171875, "loss": 1.2707, "rewards/accuracies": 0.0, "rewards/chosen": 6.3196868896484375, "rewards/margins": -0.7938203811645508, "rewards/rejected": 7.113507270812988, "step": 2459 }, { "epoch": 0.54, "learning_rate": 8.539333962828182e-06, "logits/chosen": -1.5642390251159668, "logits/rejected": -1.609544038772583, "logps/chosen": -26.220590591430664, "logps/rejected": -33.2767333984375, "loss": 0.6449, "rewards/accuracies": 0.0, "rewards/chosen": 2.2456929683685303, "rewards/margins": -0.8991427421569824, "rewards/rejected": 3.1448357105255127, "step": 2460 }, { "epoch": 0.54, "learning_rate": 8.53806773177531e-06, "logits/chosen": -1.5280694961547852, "logits/rejected": -1.5306031703948975, "logps/chosen": -39.744842529296875, "logps/rejected": -46.270591735839844, "loss": 1.0839, "rewards/accuracies": 0.0, "rewards/chosen": 3.052544355392456, "rewards/margins": -0.23558974266052246, "rewards/rejected": 3.2881340980529785, "step": 2461 }, { "epoch": 0.54, "learning_rate": 8.53680104608924e-06, "logits/chosen": -1.6059855222702026, "logits/rejected": -1.4515146017074585, "logps/chosen": -84.61662292480469, "logps/rejected": -12.838933944702148, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 3.395956516265869, "rewards/margins": 2.8760149478912354, "rewards/rejected": 0.519941508769989, "step": 2462 }, { "epoch": 0.55, "learning_rate": 8.535533905932739e-06, "logits/chosen": -1.4899979829788208, "logits/rejected": -1.4652671813964844, "logps/chosen": -184.58963012695312, "logps/rejected": -110.26763153076172, "loss": 0.5531, "rewards/accuracies": 1.0, "rewards/chosen": 7.506552219390869, "rewards/margins": 4.341765403747559, "rewards/rejected": 3.1647865772247314, "step": 2463 }, { "epoch": 0.55, "learning_rate": 8.534266311468629e-06, "logits/chosen": -1.6230159997940063, "logits/rejected": -1.5904370546340942, "logps/chosen": -79.55917358398438, "logps/rejected": -61.42163848876953, "loss": 1.0759, "rewards/accuracies": 0.0, "rewards/chosen": 5.307873725891113, "rewards/margins": -1.959984302520752, "rewards/rejected": 7.267858028411865, "step": 2464 }, { "epoch": 0.55, "learning_rate": 8.532998262859794e-06, "logits/chosen": -1.4195894002914429, "logits/rejected": -1.3955745697021484, "logps/chosen": -41.831111907958984, "logps/rejected": -44.54917907714844, "loss": 0.8116, "rewards/accuracies": 0.0, "rewards/chosen": 3.3581302165985107, "rewards/margins": -1.3970515727996826, "rewards/rejected": 4.755181789398193, "step": 2465 }, { "epoch": 0.55, "learning_rate": 8.531729760269176e-06, "logits/chosen": -1.6941895484924316, "logits/rejected": -1.7263942956924438, "logps/chosen": -47.714881896972656, "logps/rejected": -50.890769958496094, "loss": 0.3338, "rewards/accuracies": 1.0, "rewards/chosen": 3.0778510570526123, "rewards/margins": 0.7669668197631836, "rewards/rejected": 2.3108842372894287, "step": 2466 }, { "epoch": 0.55, "learning_rate": 8.530460803859772e-06, "logits/chosen": -1.6307063102722168, "logits/rejected": -1.6445536613464355, "logps/chosen": -84.86424255371094, "logps/rejected": -61.02760314941406, "loss": 0.3734, "rewards/accuracies": 1.0, "rewards/chosen": 3.3979644775390625, "rewards/margins": 1.1715514659881592, "rewards/rejected": 2.2264130115509033, "step": 2467 }, { "epoch": 0.55, "learning_rate": 8.529191393794645e-06, "logits/chosen": -1.58750319480896, "logits/rejected": -1.5216333866119385, "logps/chosen": -31.506757736206055, "logps/rejected": -83.2761001586914, "loss": 2.6373, "rewards/accuracies": 1.0, "rewards/chosen": 4.69403076171875, "rewards/margins": 2.1476738452911377, "rewards/rejected": 2.5463569164276123, "step": 2468 }, { "epoch": 0.55, "learning_rate": 8.527921530236905e-06, "logits/chosen": -1.5489650964736938, "logits/rejected": -1.5418587923049927, "logps/chosen": -45.18670654296875, "logps/rejected": -40.127464294433594, "loss": 0.7137, "rewards/accuracies": 0.0, "rewards/chosen": 3.1858718395233154, "rewards/margins": -0.41397714614868164, "rewards/rejected": 3.599848985671997, "step": 2469 }, { "epoch": 0.55, "learning_rate": 8.52665121334973e-06, "logits/chosen": -1.540295124053955, "logits/rejected": -1.6092392206192017, "logps/chosen": -52.511844635009766, "logps/rejected": -90.91893005371094, "loss": 2.0515, "rewards/accuracies": 0.0, "rewards/chosen": 1.8014522790908813, "rewards/margins": -4.062082290649414, "rewards/rejected": 5.863534450531006, "step": 2470 }, { "epoch": 0.55, "learning_rate": 8.525380443296353e-06, "logits/chosen": -1.432675838470459, "logits/rejected": -1.432675838470459, "logps/chosen": -25.071077346801758, "logps/rejected": -25.071077346801758, "loss": 0.3502, "rewards/accuracies": 0.0, "rewards/chosen": 1.723282814025879, "rewards/margins": 0.0, "rewards/rejected": 1.723282814025879, "step": 2471 }, { "epoch": 0.55, "learning_rate": 8.524109220240064e-06, "logits/chosen": -1.789231538772583, "logits/rejected": -1.7333886623382568, "logps/chosen": -67.87391662597656, "logps/rejected": -58.03651428222656, "loss": 0.2837, "rewards/accuracies": 1.0, "rewards/chosen": 3.492704153060913, "rewards/margins": 0.41051411628723145, "rewards/rejected": 3.0821900367736816, "step": 2472 }, { "epoch": 0.55, "learning_rate": 8.52283754434421e-06, "logits/chosen": -1.496092438697815, "logits/rejected": -1.4721457958221436, "logps/chosen": -49.91654968261719, "logps/rejected": -61.91826629638672, "loss": 0.9801, "rewards/accuracies": 0.0, "rewards/chosen": 2.832362413406372, "rewards/margins": -1.751279592514038, "rewards/rejected": 4.58364200592041, "step": 2473 }, { "epoch": 0.55, "learning_rate": 8.521565415772201e-06, "logits/chosen": -1.6313010454177856, "logits/rejected": -1.5050641298294067, "logps/chosen": -48.83891296386719, "logps/rejected": -23.890270233154297, "loss": 0.7236, "rewards/accuracies": 1.0, "rewards/chosen": 3.861457109451294, "rewards/margins": 4.218318939208984, "rewards/rejected": -0.3568618893623352, "step": 2474 }, { "epoch": 0.55, "learning_rate": 8.520292834687503e-06, "logits/chosen": -1.6842470169067383, "logits/rejected": -1.5769842863082886, "logps/chosen": -68.23320007324219, "logps/rejected": -49.82929992675781, "loss": 0.6314, "rewards/accuracies": 0.0, "rewards/chosen": 2.2973480224609375, "rewards/margins": -0.7997627258300781, "rewards/rejected": 3.0971107482910156, "step": 2475 }, { "epoch": 0.55, "learning_rate": 8.519019801253637e-06, "logits/chosen": -1.3385612964630127, "logits/rejected": -1.3385612964630127, "logps/chosen": -26.124340057373047, "logps/rejected": -26.124340057373047, "loss": 0.5077, "rewards/accuracies": 0.0, "rewards/chosen": 2.1048645973205566, "rewards/margins": 0.0, "rewards/rejected": 2.1048645973205566, "step": 2476 }, { "epoch": 0.55, "learning_rate": 8.517746315634186e-06, "logits/chosen": -1.5322331190109253, "logits/rejected": -1.542256236076355, "logps/chosen": -25.10657501220703, "logps/rejected": -37.63894271850586, "loss": 0.8842, "rewards/accuracies": 1.0, "rewards/chosen": 2.7336766719818115, "rewards/margins": 0.6284999847412109, "rewards/rejected": 2.1051766872406006, "step": 2477 }, { "epoch": 0.55, "learning_rate": 8.51647237799279e-06, "logits/chosen": -1.7067240476608276, "logits/rejected": -1.566739797592163, "logps/chosen": -53.00598907470703, "logps/rejected": -16.4644775390625, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 3.6383156776428223, "rewards/margins": 3.3682479858398438, "rewards/rejected": 0.27006760239601135, "step": 2478 }, { "epoch": 0.55, "learning_rate": 8.515197988493146e-06, "logits/chosen": -1.5532654523849487, "logits/rejected": -1.5311357975006104, "logps/chosen": -42.004150390625, "logps/rejected": -18.98661231994629, "loss": 0.2, "rewards/accuracies": 1.0, "rewards/chosen": 3.0038001537323, "rewards/margins": 1.7548803091049194, "rewards/rejected": 1.2489198446273804, "step": 2479 }, { "epoch": 0.55, "learning_rate": 8.513923147299012e-06, "logits/chosen": -1.9427869319915771, "logits/rejected": -1.8820202350616455, "logps/chosen": -84.5825424194336, "logps/rejected": -125.14637756347656, "loss": 0.4396, "rewards/accuracies": 0.0, "rewards/chosen": 8.192673683166504, "rewards/margins": -0.2314138412475586, "rewards/rejected": 8.424087524414062, "step": 2480 }, { "epoch": 0.55, "learning_rate": 8.512647854574201e-06, "logits/chosen": -1.4954533576965332, "logits/rejected": -1.3860867023468018, "logps/chosen": -91.27923583984375, "logps/rejected": -63.04255676269531, "loss": 1.0431, "rewards/accuracies": 1.0, "rewards/chosen": 5.10946798324585, "rewards/margins": 1.020493984222412, "rewards/rejected": 4.0889739990234375, "step": 2481 }, { "epoch": 0.55, "learning_rate": 8.511372110482583e-06, "logits/chosen": -1.533241868019104, "logits/rejected": -1.5048892498016357, "logps/chosen": -115.18682861328125, "logps/rejected": -88.65026092529297, "loss": 0.5325, "rewards/accuracies": 0.0, "rewards/chosen": 6.421763896942139, "rewards/margins": -0.6340384483337402, "rewards/rejected": 7.055802345275879, "step": 2482 }, { "epoch": 0.55, "learning_rate": 8.510095915188093e-06, "logits/chosen": -1.7715668678283691, "logits/rejected": -1.6580549478530884, "logps/chosen": -80.39491271972656, "logps/rejected": -59.237548828125, "loss": 0.3226, "rewards/accuracies": 1.0, "rewards/chosen": 6.137075901031494, "rewards/margins": 0.16873407363891602, "rewards/rejected": 5.968341827392578, "step": 2483 }, { "epoch": 0.55, "learning_rate": 8.508819268854713e-06, "logits/chosen": -1.6814920902252197, "logits/rejected": -1.6357430219650269, "logps/chosen": -54.82030487060547, "logps/rejected": -38.900245666503906, "loss": 0.3174, "rewards/accuracies": 1.0, "rewards/chosen": 2.73103404045105, "rewards/margins": 0.1867201328277588, "rewards/rejected": 2.544313907623291, "step": 2484 }, { "epoch": 0.55, "learning_rate": 8.507542171646493e-06, "logits/chosen": -1.791929006576538, "logits/rejected": -1.7114826440811157, "logps/chosen": -51.62297439575195, "logps/rejected": -73.08786010742188, "loss": 0.4955, "rewards/accuracies": 1.0, "rewards/chosen": 3.667631149291992, "rewards/margins": 1.6059291362762451, "rewards/rejected": 2.061702013015747, "step": 2485 }, { "epoch": 0.55, "learning_rate": 8.506264623727536e-06, "logits/chosen": -1.739949107170105, "logits/rejected": -1.739949107170105, "logps/chosen": -13.183420181274414, "logps/rejected": -13.183420181274414, "loss": 0.8913, "rewards/accuracies": 0.0, "rewards/chosen": 1.318855881690979, "rewards/margins": 0.0, "rewards/rejected": 1.318855881690979, "step": 2486 }, { "epoch": 0.55, "learning_rate": 8.504986625262004e-06, "logits/chosen": -1.313338279724121, "logits/rejected": -1.3104383945465088, "logps/chosen": -93.1657943725586, "logps/rejected": -55.40854263305664, "loss": 1.4812, "rewards/accuracies": 0.0, "rewards/chosen": 1.2651894092559814, "rewards/margins": -2.5791239738464355, "rewards/rejected": 3.844313383102417, "step": 2487 }, { "epoch": 0.55, "learning_rate": 8.503708176414115e-06, "logits/chosen": -1.7274786233901978, "logits/rejected": -1.6459592580795288, "logps/chosen": -107.787109375, "logps/rejected": -79.16006469726562, "loss": 0.5341, "rewards/accuracies": 0.0, "rewards/chosen": 7.565722942352295, "rewards/margins": -0.6464829444885254, "rewards/rejected": 8.21220588684082, "step": 2488 }, { "epoch": 0.55, "learning_rate": 8.50242927734815e-06, "logits/chosen": -1.6577283143997192, "logits/rejected": -1.5881119966506958, "logps/chosen": -50.386932373046875, "logps/rejected": -46.61463165283203, "loss": 1.265, "rewards/accuracies": 0.0, "rewards/chosen": 2.6612954139709473, "rewards/margins": -2.2530298233032227, "rewards/rejected": 4.91432523727417, "step": 2489 }, { "epoch": 0.55, "learning_rate": 8.501149928228441e-06, "logits/chosen": -1.443913459777832, "logits/rejected": -1.4888559579849243, "logps/chosen": -42.682769775390625, "logps/rejected": -122.79978942871094, "loss": 1.3774, "rewards/accuracies": 0.0, "rewards/chosen": 3.125837802886963, "rewards/margins": -2.276150703430176, "rewards/rejected": 5.401988506317139, "step": 2490 }, { "epoch": 0.55, "learning_rate": 8.499870129219383e-06, "logits/chosen": -1.4171361923217773, "logits/rejected": -1.4171361923217773, "logps/chosen": -60.425785064697266, "logps/rejected": -60.425785064697266, "loss": 0.3655, "rewards/accuracies": 0.0, "rewards/chosen": 4.343291759490967, "rewards/margins": 0.0, "rewards/rejected": 4.343291759490967, "step": 2491 }, { "epoch": 0.55, "learning_rate": 8.498589880485428e-06, "logits/chosen": -1.4437601566314697, "logits/rejected": -1.3841396570205688, "logps/chosen": -149.3321533203125, "logps/rejected": -87.66719818115234, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 7.861340522766113, "rewards/margins": 4.073200225830078, "rewards/rejected": 3.788140058517456, "step": 2492 }, { "epoch": 0.55, "learning_rate": 8.497309182191082e-06, "logits/chosen": -1.5205731391906738, "logits/rejected": -1.5269758701324463, "logps/chosen": -32.66472244262695, "logps/rejected": -149.87326049804688, "loss": 2.0348, "rewards/accuracies": 0.0, "rewards/chosen": 3.664693832397461, "rewards/margins": -3.4083456993103027, "rewards/rejected": 7.073039531707764, "step": 2493 }, { "epoch": 0.55, "learning_rate": 8.496028034500914e-06, "logits/chosen": -1.535526156425476, "logits/rejected": -1.4975906610488892, "logps/chosen": -40.02964401245117, "logps/rejected": -48.280242919921875, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 2.4313130378723145, "rewards/margins": 0.27074241638183594, "rewards/rejected": 2.1605706214904785, "step": 2494 }, { "epoch": 0.55, "learning_rate": 8.49474643757955e-06, "logits/chosen": -1.4029101133346558, "logits/rejected": -1.4511675834655762, "logps/chosen": -63.543861389160156, "logps/rejected": -100.98951721191406, "loss": 3.3356, "rewards/accuracies": 0.0, "rewards/chosen": 6.489418983459473, "rewards/margins": -3.353400230407715, "rewards/rejected": 9.842819213867188, "step": 2495 }, { "epoch": 0.55, "learning_rate": 8.493464391591665e-06, "logits/chosen": -1.2693804502487183, "logits/rejected": -1.1483502388000488, "logps/chosen": -83.26596069335938, "logps/rejected": -12.12924861907959, "loss": 0.2106, "rewards/accuracies": 1.0, "rewards/chosen": 2.884347677230835, "rewards/margins": 2.2181999683380127, "rewards/rejected": 0.6661477088928223, "step": 2496 }, { "epoch": 0.55, "learning_rate": 8.492181896702008e-06, "logits/chosen": -1.4039725065231323, "logits/rejected": -1.3186092376708984, "logps/chosen": -50.62785339355469, "logps/rejected": -51.38182830810547, "loss": 1.0357, "rewards/accuracies": 0.0, "rewards/chosen": 1.5953186750411987, "rewards/margins": -1.7561806440353394, "rewards/rejected": 3.351499319076538, "step": 2497 }, { "epoch": 0.55, "learning_rate": 8.49089895307537e-06, "logits/chosen": -1.5136743783950806, "logits/rejected": -1.547688364982605, "logps/chosen": -75.63180541992188, "logps/rejected": -108.9464111328125, "loss": 3.235, "rewards/accuracies": 0.0, "rewards/chosen": 3.5084030628204346, "rewards/margins": -1.244619607925415, "rewards/rejected": 4.75302267074585, "step": 2498 }, { "epoch": 0.55, "learning_rate": 8.48961556087661e-06, "logits/chosen": -1.3601635694503784, "logits/rejected": -1.3900139331817627, "logps/chosen": -56.82801818847656, "logps/rejected": -53.91778564453125, "loss": 0.4113, "rewards/accuracies": 0.0, "rewards/chosen": 2.7419419288635254, "rewards/margins": -0.23064565658569336, "rewards/rejected": 2.9725875854492188, "step": 2499 }, { "epoch": 0.55, "learning_rate": 8.48833172027064e-06, "logits/chosen": -1.7572150230407715, "logits/rejected": -1.7622699737548828, "logps/chosen": -80.89122009277344, "logps/rejected": -83.41031646728516, "loss": 0.6064, "rewards/accuracies": 1.0, "rewards/chosen": 8.626663208007812, "rewards/margins": 2.426297664642334, "rewards/rejected": 6.2003655433654785, "step": 2500 }, { "epoch": 0.55, "learning_rate": 8.487047431422426e-06, "logits/chosen": -1.4963772296905518, "logits/rejected": -1.42117440700531, "logps/chosen": -59.89395523071289, "logps/rejected": -54.909122467041016, "loss": 0.2701, "rewards/accuracies": 1.0, "rewards/chosen": 3.2965238094329834, "rewards/margins": 0.3600456714630127, "rewards/rejected": 2.9364781379699707, "step": 2501 }, { "epoch": 0.55, "learning_rate": 8.485762694497001e-06, "logits/chosen": -1.6771191358566284, "logits/rejected": -1.5345804691314697, "logps/chosen": -77.80073547363281, "logps/rejected": -31.11760902404785, "loss": 0.7587, "rewards/accuracies": 0.0, "rewards/chosen": 2.093522787094116, "rewards/margins": -0.6095190048217773, "rewards/rejected": 2.7030417919158936, "step": 2502 }, { "epoch": 0.55, "learning_rate": 8.484477509659452e-06, "logits/chosen": -1.521254539489746, "logits/rejected": -1.521254539489746, "logps/chosen": -77.84092712402344, "logps/rejected": -77.84092712402344, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 1.6786972284317017, "rewards/margins": 0.0, "rewards/rejected": 1.6786972284317017, "step": 2503 }, { "epoch": 0.55, "learning_rate": 8.483191877074916e-06, "logits/chosen": -1.2993046045303345, "logits/rejected": -1.2465028762817383, "logps/chosen": -76.2760009765625, "logps/rejected": -65.6961441040039, "loss": 0.8245, "rewards/accuracies": 1.0, "rewards/chosen": 5.52743673324585, "rewards/margins": 2.285611629486084, "rewards/rejected": 3.2418251037597656, "step": 2504 }, { "epoch": 0.55, "learning_rate": 8.4819057969086e-06, "logits/chosen": -1.5663588047027588, "logits/rejected": -1.5433955192565918, "logps/chosen": -140.6620635986328, "logps/rejected": -102.6845703125, "loss": 0.3235, "rewards/accuracies": 1.0, "rewards/chosen": 6.281596660614014, "rewards/margins": 0.1415252685546875, "rewards/rejected": 6.140071392059326, "step": 2505 }, { "epoch": 0.55, "learning_rate": 8.480619269325759e-06, "logits/chosen": -1.4938313961029053, "logits/rejected": -1.462109088897705, "logps/chosen": -68.4146728515625, "logps/rejected": -54.477413177490234, "loss": 1.363, "rewards/accuracies": 1.0, "rewards/chosen": 2.744417667388916, "rewards/margins": 0.9224026203155518, "rewards/rejected": 1.8220150470733643, "step": 2506 }, { "epoch": 0.55, "learning_rate": 8.479332294491707e-06, "logits/chosen": -1.5412890911102295, "logits/rejected": -1.4761608839035034, "logps/chosen": -71.15985870361328, "logps/rejected": -44.696510314941406, "loss": 0.8947, "rewards/accuracies": 0.0, "rewards/chosen": 3.487290143966675, "rewards/margins": -1.144709825515747, "rewards/rejected": 4.631999969482422, "step": 2507 }, { "epoch": 0.56, "learning_rate": 8.47804487257182e-06, "logits/chosen": -1.5545474290847778, "logits/rejected": -1.501312017440796, "logps/chosen": -37.03046417236328, "logps/rejected": -26.715166091918945, "loss": 1.4084, "rewards/accuracies": 0.0, "rewards/chosen": 2.179677724838257, "rewards/margins": -0.8972222805023193, "rewards/rejected": 3.076900005340576, "step": 2508 }, { "epoch": 0.56, "learning_rate": 8.47675700373153e-06, "logits/chosen": -1.359340786933899, "logits/rejected": -1.2658674716949463, "logps/chosen": -50.43262481689453, "logps/rejected": -6.781032085418701, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 5.93098783493042, "rewards/margins": 5.092236518859863, "rewards/rejected": 0.8387514352798462, "step": 2509 }, { "epoch": 0.56, "learning_rate": 8.475468688136322e-06, "logits/chosen": -1.7353135347366333, "logits/rejected": -1.673345685005188, "logps/chosen": -103.8754653930664, "logps/rejected": -105.27500915527344, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 6.113964080810547, "rewards/margins": 3.0569069385528564, "rewards/rejected": 3.0570571422576904, "step": 2510 }, { "epoch": 0.56, "learning_rate": 8.47417992595174e-06, "logits/chosen": -1.3639519214630127, "logits/rejected": -1.2734451293945312, "logps/chosen": -108.46275329589844, "logps/rejected": -46.4012565612793, "loss": 0.5205, "rewards/accuracies": 0.0, "rewards/chosen": 6.141944885253906, "rewards/margins": -0.5651397705078125, "rewards/rejected": 6.707084655761719, "step": 2511 }, { "epoch": 0.56, "learning_rate": 8.472890717343391e-06, "logits/chosen": -1.3787862062454224, "logits/rejected": -1.363357424736023, "logps/chosen": -23.421703338623047, "logps/rejected": -42.415592193603516, "loss": 0.5641, "rewards/accuracies": 1.0, "rewards/chosen": 1.3088200092315674, "rewards/margins": 0.606910765171051, "rewards/rejected": 0.7019092440605164, "step": 2512 }, { "epoch": 0.56, "learning_rate": 8.471601062476933e-06, "logits/chosen": -1.5871672630310059, "logits/rejected": -1.394420862197876, "logps/chosen": -77.1881103515625, "logps/rejected": -114.98797607421875, "loss": 1.647, "rewards/accuracies": 0.0, "rewards/chosen": 2.929852247238159, "rewards/margins": -2.610840082168579, "rewards/rejected": 5.540692329406738, "step": 2513 }, { "epoch": 0.56, "learning_rate": 8.470310961518085e-06, "logits/chosen": -1.906851053237915, "logits/rejected": -1.9431471824645996, "logps/chosen": -103.57862091064453, "logps/rejected": -96.34736633300781, "loss": 1.8622, "rewards/accuracies": 0.0, "rewards/chosen": 5.384623050689697, "rewards/margins": -3.628547191619873, "rewards/rejected": 9.01317024230957, "step": 2514 }, { "epoch": 0.56, "learning_rate": 8.469020414632619e-06, "logits/chosen": -1.597458839416504, "logits/rejected": -1.6583114862442017, "logps/chosen": -35.86779022216797, "logps/rejected": -129.85618591308594, "loss": 1.419, "rewards/accuracies": 0.0, "rewards/chosen": 2.88481068611145, "rewards/margins": -2.6813971996307373, "rewards/rejected": 5.5662078857421875, "step": 2515 }, { "epoch": 0.56, "learning_rate": 8.467729421986371e-06, "logits/chosen": -1.3707115650177002, "logits/rejected": -1.479203224182129, "logps/chosen": -76.31362915039062, "logps/rejected": -70.77879333496094, "loss": 1.7514, "rewards/accuracies": 0.0, "rewards/chosen": 2.17779541015625, "rewards/margins": -3.4672555923461914, "rewards/rejected": 5.645051002502441, "step": 2516 }, { "epoch": 0.56, "learning_rate": 8.466437983745227e-06, "logits/chosen": -1.553821325302124, "logits/rejected": -1.5135518312454224, "logps/chosen": -146.92279052734375, "logps/rejected": -107.598388671875, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": 8.990979194641113, "rewards/margins": 6.643690586090088, "rewards/rejected": 2.3472886085510254, "step": 2517 }, { "epoch": 0.56, "learning_rate": 8.465146100075136e-06, "logits/chosen": -1.5816373825073242, "logits/rejected": -1.5074394941329956, "logps/chosen": -57.02897644042969, "logps/rejected": -37.60160446166992, "loss": 0.518, "rewards/accuracies": 0.0, "rewards/chosen": 3.8423752784729004, "rewards/margins": -0.5796232223510742, "rewards/rejected": 4.421998500823975, "step": 2518 }, { "epoch": 0.56, "learning_rate": 8.4638537711421e-06, "logits/chosen": -1.9131278991699219, "logits/rejected": -1.9021598100662231, "logps/chosen": -36.47655487060547, "logps/rejected": -61.50140380859375, "loss": 0.7463, "rewards/accuracies": 0.0, "rewards/chosen": 2.562464952468872, "rewards/margins": -1.222205400466919, "rewards/rejected": 3.784670352935791, "step": 2519 }, { "epoch": 0.56, "learning_rate": 8.462560997112184e-06, "logits/chosen": -1.5006725788116455, "logits/rejected": -1.518965721130371, "logps/chosen": -27.555713653564453, "logps/rejected": -62.09370040893555, "loss": 0.5642, "rewards/accuracies": 0.0, "rewards/chosen": 2.739570379257202, "rewards/margins": -0.5214347839355469, "rewards/rejected": 3.261005163192749, "step": 2520 }, { "epoch": 0.56, "learning_rate": 8.4612677781515e-06, "logits/chosen": -1.6094657182693481, "logits/rejected": -1.4901689291000366, "logps/chosen": -90.69536590576172, "logps/rejected": -72.63735961914062, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 7.520609378814697, "rewards/margins": 3.754723310470581, "rewards/rejected": 3.765886068344116, "step": 2521 }, { "epoch": 0.56, "learning_rate": 8.45997411442623e-06, "logits/chosen": -1.632680058479309, "logits/rejected": -1.6010807752609253, "logps/chosen": -70.6534194946289, "logps/rejected": -60.91263961791992, "loss": 0.5039, "rewards/accuracies": 1.0, "rewards/chosen": 3.7650139331817627, "rewards/margins": 0.008723735809326172, "rewards/rejected": 3.7562901973724365, "step": 2522 }, { "epoch": 0.56, "learning_rate": 8.458680006102602e-06, "logits/chosen": -1.8332525491714478, "logits/rejected": -1.7364604473114014, "logps/chosen": -73.56515502929688, "logps/rejected": -67.68972778320312, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 5.812115669250488, "rewards/margins": 3.049617052078247, "rewards/rejected": 2.762498617172241, "step": 2523 }, { "epoch": 0.56, "learning_rate": 8.45738545334691e-06, "logits/chosen": -1.6103938817977905, "logits/rejected": -1.5769706964492798, "logps/chosen": -40.5458869934082, "logps/rejected": -40.73674392700195, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 2.234419584274292, "rewards/margins": 0.4342299699783325, "rewards/rejected": 1.8001896142959595, "step": 2524 }, { "epoch": 0.56, "learning_rate": 8.456090456325496e-06, "logits/chosen": -1.3715366125106812, "logits/rejected": -1.3404200077056885, "logps/chosen": -120.75323486328125, "logps/rejected": -54.97407913208008, "loss": 0.3828, "rewards/accuracies": 1.0, "rewards/chosen": 4.800015449523926, "rewards/margins": 1.5037450790405273, "rewards/rejected": 3.2962703704833984, "step": 2525 }, { "epoch": 0.56, "learning_rate": 8.454795015204767e-06, "logits/chosen": -1.3278855085372925, "logits/rejected": -1.2721244096755981, "logps/chosen": -37.16820526123047, "logps/rejected": -29.17522430419922, "loss": 0.2956, "rewards/accuracies": 1.0, "rewards/chosen": 2.4489002227783203, "rewards/margins": 0.2682037353515625, "rewards/rejected": 2.180696487426758, "step": 2526 }, { "epoch": 0.56, "learning_rate": 8.453499130151183e-06, "logits/chosen": -1.6815402507781982, "logits/rejected": -1.552594542503357, "logps/chosen": -129.89120483398438, "logps/rejected": -57.41587448120117, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 7.834973335266113, "rewards/margins": 4.481654167175293, "rewards/rejected": 3.3533191680908203, "step": 2527 }, { "epoch": 0.56, "learning_rate": 8.452202801331265e-06, "logits/chosen": -1.7031487226486206, "logits/rejected": -1.5984582901000977, "logps/chosen": -37.515167236328125, "logps/rejected": -18.502513885498047, "loss": 1.0104, "rewards/accuracies": 1.0, "rewards/chosen": 2.2618472576141357, "rewards/margins": 1.917250394821167, "rewards/rejected": 0.34459686279296875, "step": 2528 }, { "epoch": 0.56, "learning_rate": 8.450906028911585e-06, "logits/chosen": -1.447306752204895, "logits/rejected": -1.4303781986236572, "logps/chosen": -49.27672576904297, "logps/rejected": -50.10793685913086, "loss": 0.7623, "rewards/accuracies": 1.0, "rewards/chosen": 2.3386573791503906, "rewards/margins": 0.05547595024108887, "rewards/rejected": 2.2831814289093018, "step": 2529 }, { "epoch": 0.56, "learning_rate": 8.449608813058776e-06, "logits/chosen": -1.174609899520874, "logits/rejected": -1.1436772346496582, "logps/chosen": -39.84910583496094, "logps/rejected": -43.29928207397461, "loss": 2.6069, "rewards/accuracies": 0.0, "rewards/chosen": 2.222885847091675, "rewards/margins": -1.7793028354644775, "rewards/rejected": 4.002188682556152, "step": 2530 }, { "epoch": 0.56, "learning_rate": 8.448311153939527e-06, "logits/chosen": -1.556969404220581, "logits/rejected": -1.4651554822921753, "logps/chosen": -38.05265426635742, "logps/rejected": -23.33966064453125, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": 4.169898509979248, "rewards/margins": 3.1412405967712402, "rewards/rejected": 1.0286579132080078, "step": 2531 }, { "epoch": 0.56, "learning_rate": 8.447013051720585e-06, "logits/chosen": -1.6723518371582031, "logits/rejected": -1.6445338726043701, "logps/chosen": -75.27015686035156, "logps/rejected": -68.8209457397461, "loss": 0.2664, "rewards/accuracies": 1.0, "rewards/chosen": 3.4229278564453125, "rewards/margins": 0.3914473056793213, "rewards/rejected": 3.031480550765991, "step": 2532 }, { "epoch": 0.56, "learning_rate": 8.445714506568751e-06, "logits/chosen": -1.3216866254806519, "logits/rejected": -1.2384274005889893, "logps/chosen": -73.18244171142578, "logps/rejected": -26.710315704345703, "loss": 0.4564, "rewards/accuracies": 1.0, "rewards/chosen": 2.903118133544922, "rewards/margins": 0.5681140422821045, "rewards/rejected": 2.3350040912628174, "step": 2533 }, { "epoch": 0.56, "learning_rate": 8.444415518650887e-06, "logits/chosen": -1.4719451665878296, "logits/rejected": -1.3465030193328857, "logps/chosen": -157.46484375, "logps/rejected": -62.988563537597656, "loss": 0.5863, "rewards/accuracies": 1.0, "rewards/chosen": 6.16414213180542, "rewards/margins": 0.9241619110107422, "rewards/rejected": 5.239980220794678, "step": 2534 }, { "epoch": 0.56, "learning_rate": 8.443116088133908e-06, "logits/chosen": -1.4896864891052246, "logits/rejected": -1.5125874280929565, "logps/chosen": -36.57331848144531, "logps/rejected": -60.187164306640625, "loss": 0.5726, "rewards/accuracies": 0.0, "rewards/chosen": 3.8324525356292725, "rewards/margins": -0.7116239070892334, "rewards/rejected": 4.544076442718506, "step": 2535 }, { "epoch": 0.56, "learning_rate": 8.44181621518479e-06, "logits/chosen": -1.855368971824646, "logits/rejected": -1.8305455446243286, "logps/chosen": -75.67581176757812, "logps/rejected": -70.67636108398438, "loss": 1.0036, "rewards/accuracies": 0.0, "rewards/chosen": 2.4737548828125, "rewards/margins": -0.7908456325531006, "rewards/rejected": 3.2646005153656006, "step": 2536 }, { "epoch": 0.56, "learning_rate": 8.440515899970561e-06, "logits/chosen": -1.170851707458496, "logits/rejected": -1.0424706935882568, "logps/chosen": -144.57473754882812, "logps/rejected": -79.15953826904297, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 5.57173490524292, "rewards/margins": 1.0247726440429688, "rewards/rejected": 4.546962261199951, "step": 2537 }, { "epoch": 0.56, "learning_rate": 8.43921514265831e-06, "logits/chosen": -1.968793272972107, "logits/rejected": -1.9271724224090576, "logps/chosen": -45.755348205566406, "logps/rejected": -25.212173461914062, "loss": 2.0232, "rewards/accuracies": 1.0, "rewards/chosen": 2.537792921066284, "rewards/margins": 1.8155715465545654, "rewards/rejected": 0.7222213745117188, "step": 2538 }, { "epoch": 0.56, "learning_rate": 8.437913943415181e-06, "logits/chosen": -1.6968286037445068, "logits/rejected": -1.6938292980194092, "logps/chosen": -126.4719009399414, "logps/rejected": -104.27894592285156, "loss": 0.2465, "rewards/accuracies": 1.0, "rewards/chosen": 6.255691051483154, "rewards/margins": 0.5374765396118164, "rewards/rejected": 5.718214511871338, "step": 2539 }, { "epoch": 0.56, "learning_rate": 8.436612302408376e-06, "logits/chosen": -1.4774919748306274, "logits/rejected": -1.4103800058364868, "logps/chosen": -61.74319839477539, "logps/rejected": -60.748321533203125, "loss": 0.9824, "rewards/accuracies": 1.0, "rewards/chosen": 3.1433026790618896, "rewards/margins": 0.7686383724212646, "rewards/rejected": 2.374664306640625, "step": 2540 }, { "epoch": 0.56, "learning_rate": 8.43531021980515e-06, "logits/chosen": -1.8039342164993286, "logits/rejected": -1.7645978927612305, "logps/chosen": -60.37351989746094, "logps/rejected": -48.33040237426758, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": 3.00895619392395, "rewards/margins": 0.3213620185852051, "rewards/rejected": 2.687594175338745, "step": 2541 }, { "epoch": 0.56, "learning_rate": 8.434007695772819e-06, "logits/chosen": -1.423344373703003, "logits/rejected": -1.423344373703003, "logps/chosen": -63.494239807128906, "logps/rejected": -63.494239807128906, "loss": 0.5846, "rewards/accuracies": 0.0, "rewards/chosen": 4.196229457855225, "rewards/margins": 0.0, "rewards/rejected": 4.196229457855225, "step": 2542 }, { "epoch": 0.56, "learning_rate": 8.432704730478756e-06, "logits/chosen": -1.602089524269104, "logits/rejected": -1.5813581943511963, "logps/chosen": -55.55863952636719, "logps/rejected": -65.23764038085938, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 3.3821732997894287, "rewards/margins": 0.03344130516052246, "rewards/rejected": 3.3487319946289062, "step": 2543 }, { "epoch": 0.56, "learning_rate": 8.431401324090384e-06, "logits/chosen": -1.5290509462356567, "logits/rejected": -1.3549840450286865, "logps/chosen": -141.46372985839844, "logps/rejected": -56.602806091308594, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 6.796220302581787, "rewards/margins": 4.273297309875488, "rewards/rejected": 2.522923231124878, "step": 2544 }, { "epoch": 0.56, "learning_rate": 8.430097476775194e-06, "logits/chosen": -1.5461889505386353, "logits/rejected": -1.3746908903121948, "logps/chosen": -143.282958984375, "logps/rejected": -48.46835708618164, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 6.075904846191406, "rewards/margins": 4.726210117340088, "rewards/rejected": 1.349694848060608, "step": 2545 }, { "epoch": 0.56, "learning_rate": 8.428793188700722e-06, "logits/chosen": -1.3426942825317383, "logits/rejected": -1.5839245319366455, "logps/chosen": -44.35181427001953, "logps/rejected": -31.44076156616211, "loss": 0.3078, "rewards/accuracies": 1.0, "rewards/chosen": 3.308295488357544, "rewards/margins": 1.410312294960022, "rewards/rejected": 1.897983193397522, "step": 2546 }, { "epoch": 0.56, "learning_rate": 8.427488460034567e-06, "logits/chosen": -1.5391093492507935, "logits/rejected": -1.4668502807617188, "logps/chosen": -38.46570587158203, "logps/rejected": -13.960514068603516, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 2.569687604904175, "rewards/margins": 1.9927711486816406, "rewards/rejected": 0.576916515827179, "step": 2547 }, { "epoch": 0.56, "learning_rate": 8.426183290944387e-06, "logits/chosen": -1.4944241046905518, "logits/rejected": -1.3771858215332031, "logps/chosen": -57.31611251831055, "logps/rejected": -40.194313049316406, "loss": 0.3115, "rewards/accuracies": 1.0, "rewards/chosen": 3.568429946899414, "rewards/margins": 0.8296122550964355, "rewards/rejected": 2.7388176918029785, "step": 2548 }, { "epoch": 0.56, "learning_rate": 8.424877681597889e-06, "logits/chosen": -1.3772839307785034, "logits/rejected": -1.306752324104309, "logps/chosen": -105.88571166992188, "logps/rejected": -91.65573120117188, "loss": 0.2888, "rewards/accuracies": 1.0, "rewards/chosen": 7.699505805969238, "rewards/margins": 1.500823974609375, "rewards/rejected": 6.198681831359863, "step": 2549 }, { "epoch": 0.56, "learning_rate": 8.423571632162843e-06, "logits/chosen": -1.2145600318908691, "logits/rejected": -1.1104222536087036, "logps/chosen": -59.47734069824219, "logps/rejected": -8.336626052856445, "loss": 0.6996, "rewards/accuracies": 1.0, "rewards/chosen": 3.3451035022735596, "rewards/margins": 2.2127466201782227, "rewards/rejected": 1.132356882095337, "step": 2550 }, { "epoch": 0.56, "learning_rate": 8.422265142807071e-06, "logits/chosen": -1.400291919708252, "logits/rejected": -1.400291919708252, "logps/chosen": -84.08572387695312, "logps/rejected": -84.08572387695312, "loss": 0.8018, "rewards/accuracies": 0.0, "rewards/chosen": 4.358923435211182, "rewards/margins": 0.0, "rewards/rejected": 4.358923435211182, "step": 2551 }, { "epoch": 0.56, "learning_rate": 8.420958213698455e-06, "logits/chosen": -1.7791775465011597, "logits/rejected": -1.7791775465011597, "logps/chosen": -58.955413818359375, "logps/rejected": -58.955413818359375, "loss": 0.574, "rewards/accuracies": 0.0, "rewards/chosen": 6.090001583099365, "rewards/margins": 0.0, "rewards/rejected": 6.090001583099365, "step": 2552 }, { "epoch": 0.57, "learning_rate": 8.419650845004932e-06, "logits/chosen": -1.3618135452270508, "logits/rejected": -1.3205976486206055, "logps/chosen": -135.65670776367188, "logps/rejected": -125.3407211303711, "loss": 0.7911, "rewards/accuracies": 0.0, "rewards/chosen": 4.57936429977417, "rewards/margins": -1.3367071151733398, "rewards/rejected": 5.91607141494751, "step": 2553 }, { "epoch": 0.57, "learning_rate": 8.418343036894497e-06, "logits/chosen": -1.4364676475524902, "logits/rejected": -1.4364676475524902, "logps/chosen": -65.05477905273438, "logps/rejected": -65.05477905273438, "loss": 0.5192, "rewards/accuracies": 0.0, "rewards/chosen": 2.9461381435394287, "rewards/margins": 0.0, "rewards/rejected": 2.9461381435394287, "step": 2554 }, { "epoch": 0.57, "learning_rate": 8.4170347895352e-06, "logits/chosen": -1.625858187675476, "logits/rejected": -1.6084481477737427, "logps/chosen": -49.01589584350586, "logps/rejected": -58.388275146484375, "loss": 0.8031, "rewards/accuracies": 0.0, "rewards/chosen": 1.2316070795059204, "rewards/margins": -1.1652861833572388, "rewards/rejected": 2.396893262863159, "step": 2555 }, { "epoch": 0.57, "learning_rate": 8.415726103095146e-06, "logits/chosen": -1.7455962896347046, "logits/rejected": -1.4535644054412842, "logps/chosen": -80.96080780029297, "logps/rejected": -169.95738220214844, "loss": 0.552, "rewards/accuracies": 0.0, "rewards/chosen": 2.6220619678497314, "rewards/margins": -0.697385311126709, "rewards/rejected": 3.3194472789764404, "step": 2556 }, { "epoch": 0.57, "learning_rate": 8.414416977742498e-06, "logits/chosen": -1.6581509113311768, "logits/rejected": -1.6635329723358154, "logps/chosen": -48.215660095214844, "logps/rejected": -65.7013931274414, "loss": 0.3985, "rewards/accuracies": 1.0, "rewards/chosen": 3.9494972229003906, "rewards/margins": 0.05321645736694336, "rewards/rejected": 3.8962807655334473, "step": 2557 }, { "epoch": 0.57, "learning_rate": 8.413107413645477e-06, "logits/chosen": -1.6672446727752686, "logits/rejected": -1.655971884727478, "logps/chosen": -162.11215209960938, "logps/rejected": -133.72232055664062, "loss": 0.7137, "rewards/accuracies": 0.0, "rewards/chosen": 7.5810227394104, "rewards/margins": -1.1501145362854004, "rewards/rejected": 8.7311372756958, "step": 2558 }, { "epoch": 0.57, "learning_rate": 8.411797410972358e-06, "logits/chosen": -1.6863313913345337, "logits/rejected": -1.6926953792572021, "logps/chosen": -70.84113311767578, "logps/rejected": -58.96314239501953, "loss": 0.9333, "rewards/accuracies": 0.0, "rewards/chosen": 2.1729416847229004, "rewards/margins": -1.6851952075958252, "rewards/rejected": 3.8581368923187256, "step": 2559 }, { "epoch": 0.57, "learning_rate": 8.410486969891475e-06, "logits/chosen": -1.6337800025939941, "logits/rejected": -1.6437081098556519, "logps/chosen": -95.72036743164062, "logps/rejected": -30.715667724609375, "loss": 2.5476, "rewards/accuracies": 1.0, "rewards/chosen": 3.438018798828125, "rewards/margins": 0.43813443183898926, "rewards/rejected": 2.9998843669891357, "step": 2560 }, { "epoch": 0.57, "learning_rate": 8.409176090571214e-06, "logits/chosen": -1.6532151699066162, "logits/rejected": -1.6820743083953857, "logps/chosen": -83.2723388671875, "logps/rejected": -79.21269226074219, "loss": 0.9234, "rewards/accuracies": 1.0, "rewards/chosen": 6.84346342086792, "rewards/margins": 0.4935932159423828, "rewards/rejected": 6.349870204925537, "step": 2561 }, { "epoch": 0.57, "learning_rate": 8.40786477318002e-06, "logits/chosen": -1.6638696193695068, "logits/rejected": -1.58493173122406, "logps/chosen": -92.50285339355469, "logps/rejected": -64.8235092163086, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 6.534951686859131, "rewards/margins": 3.667309522628784, "rewards/rejected": 2.8676421642303467, "step": 2562 }, { "epoch": 0.57, "learning_rate": 8.406553017886397e-06, "logits/chosen": -1.8029112815856934, "logits/rejected": -1.7189794778823853, "logps/chosen": -83.58724212646484, "logps/rejected": -33.089942932128906, "loss": 1.6956, "rewards/accuracies": 1.0, "rewards/chosen": 7.329361915588379, "rewards/margins": 4.335148811340332, "rewards/rejected": 2.994213104248047, "step": 2563 }, { "epoch": 0.57, "learning_rate": 8.405240824858898e-06, "logits/chosen": -1.6272363662719727, "logits/rejected": -1.659846544265747, "logps/chosen": -122.50730895996094, "logps/rejected": -125.59757995605469, "loss": 0.1571, "rewards/accuracies": 1.0, "rewards/chosen": 8.558201789855957, "rewards/margins": 1.1649508476257324, "rewards/rejected": 7.393250942230225, "step": 2564 }, { "epoch": 0.57, "learning_rate": 8.40392819426614e-06, "logits/chosen": -1.6839441061019897, "logits/rejected": -0.9466643333435059, "logps/chosen": -135.2148895263672, "logps/rejected": -122.52449035644531, "loss": 2.0658, "rewards/accuracies": 0.0, "rewards/chosen": 5.795485019683838, "rewards/margins": -0.10496807098388672, "rewards/rejected": 5.900453090667725, "step": 2565 }, { "epoch": 0.57, "learning_rate": 8.402615126276792e-06, "logits/chosen": -1.6996254920959473, "logits/rejected": -1.6783583164215088, "logps/chosen": -57.22196578979492, "logps/rejected": -62.04595947265625, "loss": 0.5541, "rewards/accuracies": 0.0, "rewards/chosen": 3.883589506149292, "rewards/margins": -0.28054022789001465, "rewards/rejected": 4.164129734039307, "step": 2566 }, { "epoch": 0.57, "learning_rate": 8.40130162105958e-06, "logits/chosen": -1.777596116065979, "logits/rejected": -1.7320001125335693, "logps/chosen": -122.11135864257812, "logps/rejected": -55.727500915527344, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 7.446206569671631, "rewards/margins": 3.2085723876953125, "rewards/rejected": 4.237634181976318, "step": 2567 }, { "epoch": 0.57, "learning_rate": 8.399987678783285e-06, "logits/chosen": -1.4175431728363037, "logits/rejected": -1.389269232749939, "logps/chosen": -44.45951461791992, "logps/rejected": -50.845279693603516, "loss": 0.3385, "rewards/accuracies": 1.0, "rewards/chosen": 2.4980862140655518, "rewards/margins": 0.06907343864440918, "rewards/rejected": 2.4290127754211426, "step": 2568 }, { "epoch": 0.57, "learning_rate": 8.398673299616747e-06, "logits/chosen": -1.7776446342468262, "logits/rejected": -1.7355554103851318, "logps/chosen": -87.88794708251953, "logps/rejected": -68.58891296386719, "loss": 0.8493, "rewards/accuracies": 0.0, "rewards/chosen": 5.402719974517822, "rewards/margins": -1.0225229263305664, "rewards/rejected": 6.425242900848389, "step": 2569 }, { "epoch": 0.57, "learning_rate": 8.397358483728861e-06, "logits/chosen": -1.7838335037231445, "logits/rejected": -1.7032188177108765, "logps/chosen": -130.7625732421875, "logps/rejected": -74.41044616699219, "loss": 0.5387, "rewards/accuracies": 1.0, "rewards/chosen": 6.117290019989014, "rewards/margins": 2.198849678039551, "rewards/rejected": 3.918440341949463, "step": 2570 }, { "epoch": 0.57, "learning_rate": 8.396043231288577e-06, "logits/chosen": -1.73550283908844, "logits/rejected": -1.7472944259643555, "logps/chosen": -93.5339126586914, "logps/rejected": -81.4658203125, "loss": 2.9751, "rewards/accuracies": 0.0, "rewards/chosen": 2.7494590282440186, "rewards/margins": -0.8124291896820068, "rewards/rejected": 3.5618882179260254, "step": 2571 }, { "epoch": 0.57, "learning_rate": 8.3947275424649e-06, "logits/chosen": -1.5158137083053589, "logits/rejected": -1.5573337078094482, "logps/chosen": -58.87857437133789, "logps/rejected": -48.24168395996094, "loss": 0.1713, "rewards/accuracies": 1.0, "rewards/chosen": 3.3981564044952393, "rewards/margins": 1.0403797626495361, "rewards/rejected": 2.357776641845703, "step": 2572 }, { "epoch": 0.57, "learning_rate": 8.393411417426895e-06, "logits/chosen": -1.6086654663085938, "logits/rejected": -1.6117005348205566, "logps/chosen": -36.781402587890625, "logps/rejected": -37.96835708618164, "loss": 0.6902, "rewards/accuracies": 0.0, "rewards/chosen": 2.6954610347747803, "rewards/margins": -0.9473981857299805, "rewards/rejected": 3.6428592205047607, "step": 2573 }, { "epoch": 0.57, "learning_rate": 8.392094856343682e-06, "logits/chosen": -1.6684714555740356, "logits/rejected": -1.6534292697906494, "logps/chosen": -68.64674377441406, "logps/rejected": -65.99968719482422, "loss": 0.494, "rewards/accuracies": 1.0, "rewards/chosen": 4.217752933502197, "rewards/margins": 1.6087234020233154, "rewards/rejected": 2.609029531478882, "step": 2574 }, { "epoch": 0.57, "learning_rate": 8.390777859384434e-06, "logits/chosen": -1.297320008277893, "logits/rejected": -1.160374402999878, "logps/chosen": -20.281496047973633, "logps/rejected": -34.90703582763672, "loss": 0.1539, "rewards/accuracies": 1.0, "rewards/chosen": 1.578133225440979, "rewards/margins": 1.0294828414916992, "rewards/rejected": 0.5486503839492798, "step": 2575 }, { "epoch": 0.57, "learning_rate": 8.38946042671838e-06, "logits/chosen": -1.2709194421768188, "logits/rejected": -1.2709194421768188, "logps/chosen": -49.715293884277344, "logps/rejected": -49.715293884277344, "loss": 0.6758, "rewards/accuracies": 0.0, "rewards/chosen": 2.4763710498809814, "rewards/margins": 0.0, "rewards/rejected": 2.4763710498809814, "step": 2576 }, { "epoch": 0.57, "learning_rate": 8.388142558514811e-06, "logits/chosen": -1.595200538635254, "logits/rejected": -1.4892008304595947, "logps/chosen": -71.19343566894531, "logps/rejected": -18.66998863220215, "loss": 0.0737, "rewards/accuracies": 1.0, "rewards/chosen": 3.1071717739105225, "rewards/margins": 2.3327577114105225, "rewards/rejected": 0.7744140625, "step": 2577 }, { "epoch": 0.57, "learning_rate": 8.38682425494307e-06, "logits/chosen": -1.4074751138687134, "logits/rejected": -1.356858253479004, "logps/chosen": -36.20153045654297, "logps/rejected": -19.676197052001953, "loss": 1.2674, "rewards/accuracies": 0.0, "rewards/chosen": 1.680219292640686, "rewards/margins": -0.8600808382034302, "rewards/rejected": 2.540300130844116, "step": 2578 }, { "epoch": 0.57, "learning_rate": 8.38550551617255e-06, "logits/chosen": -1.5665662288665771, "logits/rejected": -1.5294336080551147, "logps/chosen": -65.39985656738281, "logps/rejected": -63.16265869140625, "loss": 1.4386, "rewards/accuracies": 1.0, "rewards/chosen": 1.8115860223770142, "rewards/margins": 0.5256149768829346, "rewards/rejected": 1.2859710454940796, "step": 2579 }, { "epoch": 0.57, "learning_rate": 8.384186342372711e-06, "logits/chosen": -1.6330676078796387, "logits/rejected": -1.557887315750122, "logps/chosen": -59.55104064941406, "logps/rejected": -2.930544137954712, "loss": 1.3422, "rewards/accuracies": 1.0, "rewards/chosen": 3.2448928356170654, "rewards/margins": 2.491511583328247, "rewards/rejected": 0.7533811926841736, "step": 2580 }, { "epoch": 0.57, "learning_rate": 8.382866733713064e-06, "logits/chosen": -1.6861695051193237, "logits/rejected": -1.6966040134429932, "logps/chosen": -35.176334381103516, "logps/rejected": -51.43657684326172, "loss": 0.7177, "rewards/accuracies": 0.0, "rewards/chosen": 2.6873745918273926, "rewards/margins": -1.155555248260498, "rewards/rejected": 3.8429298400878906, "step": 2581 }, { "epoch": 0.57, "learning_rate": 8.381546690363174e-06, "logits/chosen": -1.4078518152236938, "logits/rejected": -1.359728455543518, "logps/chosen": -94.22310638427734, "logps/rejected": -43.933292388916016, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 5.611257076263428, "rewards/margins": 0.0420832633972168, "rewards/rejected": 5.569173812866211, "step": 2582 }, { "epoch": 0.57, "learning_rate": 8.380226212492661e-06, "logits/chosen": -1.41848623752594, "logits/rejected": -1.431939959526062, "logps/chosen": -43.67701721191406, "logps/rejected": -34.16132736206055, "loss": 0.5942, "rewards/accuracies": 0.0, "rewards/chosen": 2.577702283859253, "rewards/margins": -0.6409122943878174, "rewards/rejected": 3.2186145782470703, "step": 2583 }, { "epoch": 0.57, "learning_rate": 8.378905300271207e-06, "logits/chosen": -1.4870275259017944, "logits/rejected": -1.4870275259017944, "logps/chosen": -59.90303421020508, "logps/rejected": -59.90303421020508, "loss": 0.7318, "rewards/accuracies": 0.0, "rewards/chosen": 1.6339260339736938, "rewards/margins": 0.0, "rewards/rejected": 1.6339260339736938, "step": 2584 }, { "epoch": 0.57, "learning_rate": 8.377583953868545e-06, "logits/chosen": -1.396996259689331, "logits/rejected": -1.2189502716064453, "logps/chosen": -45.78603744506836, "logps/rejected": -36.46024703979492, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": 2.173311233520508, "rewards/margins": 2.008035659790039, "rewards/rejected": 0.16527557373046875, "step": 2585 }, { "epoch": 0.57, "learning_rate": 8.376262173454464e-06, "logits/chosen": -1.5836292505264282, "logits/rejected": -1.5753024816513062, "logps/chosen": -52.862823486328125, "logps/rejected": -50.86884307861328, "loss": 0.487, "rewards/accuracies": 0.0, "rewards/chosen": 2.618443250656128, "rewards/margins": -0.293642520904541, "rewards/rejected": 2.912085771560669, "step": 2586 }, { "epoch": 0.57, "learning_rate": 8.374939959198809e-06, "logits/chosen": -1.3688973188400269, "logits/rejected": -1.4157793521881104, "logps/chosen": -66.14408874511719, "logps/rejected": -74.93856811523438, "loss": 2.6782, "rewards/accuracies": 0.0, "rewards/chosen": 2.9594452381134033, "rewards/margins": -5.240683555603027, "rewards/rejected": 8.200128555297852, "step": 2587 }, { "epoch": 0.57, "learning_rate": 8.373617311271483e-06, "logits/chosen": -1.6314150094985962, "logits/rejected": -1.6039375066757202, "logps/chosen": -235.8384246826172, "logps/rejected": -106.31242370605469, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 7.023002624511719, "rewards/margins": 2.0596799850463867, "rewards/rejected": 4.963322639465332, "step": 2588 }, { "epoch": 0.57, "learning_rate": 8.372294229842442e-06, "logits/chosen": -1.4530279636383057, "logits/rejected": -1.4345169067382812, "logps/chosen": -33.387733459472656, "logps/rejected": -26.47152328491211, "loss": 0.6763, "rewards/accuracies": 0.0, "rewards/chosen": 2.23445200920105, "rewards/margins": -0.22394418716430664, "rewards/rejected": 2.4583961963653564, "step": 2589 }, { "epoch": 0.57, "learning_rate": 8.3709707150817e-06, "logits/chosen": -1.4611159563064575, "logits/rejected": -1.413231611251831, "logps/chosen": -87.21435546875, "logps/rejected": -85.40091705322266, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 5.654208660125732, "rewards/margins": 2.60262393951416, "rewards/rejected": 3.0515847206115723, "step": 2590 }, { "epoch": 0.57, "learning_rate": 8.369646767159325e-06, "logits/chosen": -1.4270035028457642, "logits/rejected": -1.3300684690475464, "logps/chosen": -57.35125732421875, "logps/rejected": -46.92729949951172, "loss": 0.7238, "rewards/accuracies": 0.0, "rewards/chosen": 2.1133811473846436, "rewards/margins": -1.0709915161132812, "rewards/rejected": 3.184372663497925, "step": 2591 }, { "epoch": 0.57, "learning_rate": 8.36832238624544e-06, "logits/chosen": -1.541730523109436, "logits/rejected": -1.5145939588546753, "logps/chosen": -39.94163131713867, "logps/rejected": -65.020263671875, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 2.5483434200286865, "rewards/margins": 0.7417844533920288, "rewards/rejected": 1.8065589666366577, "step": 2592 }, { "epoch": 0.57, "learning_rate": 8.366997572510228e-06, "logits/chosen": -1.5954385995864868, "logits/rejected": -1.522674560546875, "logps/chosen": -71.01042938232422, "logps/rejected": -56.89918518066406, "loss": 0.5917, "rewards/accuracies": 0.0, "rewards/chosen": 2.838332414627075, "rewards/margins": -0.7309563159942627, "rewards/rejected": 3.569288730621338, "step": 2593 }, { "epoch": 0.57, "learning_rate": 8.365672326123918e-06, "logits/chosen": -1.5382460355758667, "logits/rejected": -1.5399867296218872, "logps/chosen": -52.611900329589844, "logps/rejected": -47.735260009765625, "loss": 0.7228, "rewards/accuracies": 0.0, "rewards/chosen": 1.6308937072753906, "rewards/margins": -1.1680092811584473, "rewards/rejected": 2.798902988433838, "step": 2594 }, { "epoch": 0.57, "learning_rate": 8.364346647256808e-06, "logits/chosen": -1.5366122722625732, "logits/rejected": -1.5366122722625732, "logps/chosen": -61.94520568847656, "logps/rejected": -61.94520568847656, "loss": 0.8795, "rewards/accuracies": 0.0, "rewards/chosen": 3.0683045387268066, "rewards/margins": 0.0, "rewards/rejected": 3.0683045387268066, "step": 2595 }, { "epoch": 0.57, "learning_rate": 8.36302053607924e-06, "logits/chosen": -1.5953654050827026, "logits/rejected": -1.5610326528549194, "logps/chosen": -116.29219055175781, "logps/rejected": -91.03697967529297, "loss": 0.1949, "rewards/accuracies": 1.0, "rewards/chosen": 5.983987331390381, "rewards/margins": 0.7564306259155273, "rewards/rejected": 5.2275567054748535, "step": 2596 }, { "epoch": 0.57, "learning_rate": 8.361693992761617e-06, "logits/chosen": -1.4755635261535645, "logits/rejected": -1.368425965309143, "logps/chosen": -87.03907775878906, "logps/rejected": -43.82656478881836, "loss": 0.6132, "rewards/accuracies": 1.0, "rewards/chosen": 6.110513210296631, "rewards/margins": 2.1834514141082764, "rewards/rejected": 3.9270617961883545, "step": 2597 }, { "epoch": 0.58, "learning_rate": 8.360367017474398e-06, "logits/chosen": -1.270424246788025, "logits/rejected": -1.2276060581207275, "logps/chosen": -39.84294509887695, "logps/rejected": -29.032806396484375, "loss": 1.2123, "rewards/accuracies": 1.0, "rewards/chosen": 3.5982112884521484, "rewards/margins": 0.38277459144592285, "rewards/rejected": 3.2154366970062256, "step": 2598 }, { "epoch": 0.58, "learning_rate": 8.359039610388096e-06, "logits/chosen": -1.7037018537521362, "logits/rejected": -1.743551254272461, "logps/chosen": -109.39698028564453, "logps/rejected": -57.94482421875, "loss": 0.1794, "rewards/accuracies": 1.0, "rewards/chosen": 6.315532684326172, "rewards/margins": 1.2735047340393066, "rewards/rejected": 5.042027950286865, "step": 2599 }, { "epoch": 0.58, "learning_rate": 8.357711771673278e-06, "logits/chosen": -1.5665074586868286, "logits/rejected": -1.5665074586868286, "logps/chosen": -80.70806884765625, "logps/rejected": -80.70806884765625, "loss": 1.4892, "rewards/accuracies": 0.0, "rewards/chosen": 4.119444370269775, "rewards/margins": 0.0, "rewards/rejected": 4.119444370269775, "step": 2600 }, { "epoch": 0.58, "learning_rate": 8.35638350150057e-06, "logits/chosen": -1.4882915019989014, "logits/rejected": -1.5328203439712524, "logps/chosen": -46.51791000366211, "logps/rejected": -116.87895965576172, "loss": 1.7664, "rewards/accuracies": 0.0, "rewards/chosen": 3.3761022090911865, "rewards/margins": -3.4144670963287354, "rewards/rejected": 6.790569305419922, "step": 2601 }, { "epoch": 0.58, "learning_rate": 8.35505480004065e-06, "logits/chosen": -1.505384087562561, "logits/rejected": -1.3581368923187256, "logps/chosen": -67.59708404541016, "logps/rejected": -35.890357971191406, "loss": 0.4307, "rewards/accuracies": 1.0, "rewards/chosen": 1.0266029834747314, "rewards/margins": 0.9657372236251831, "rewards/rejected": 0.06086578592658043, "step": 2602 }, { "epoch": 0.58, "learning_rate": 8.353725667464254e-06, "logits/chosen": -1.4216797351837158, "logits/rejected": -1.245699405670166, "logps/chosen": -69.4407730102539, "logps/rejected": -20.601184844970703, "loss": 1.9893, "rewards/accuracies": 1.0, "rewards/chosen": 3.4766900539398193, "rewards/margins": 2.6609013080596924, "rewards/rejected": 0.8157886862754822, "step": 2603 }, { "epoch": 0.58, "learning_rate": 8.352396103942171e-06, "logits/chosen": -1.7585567235946655, "logits/rejected": -1.7585567235946655, "logps/chosen": -46.68934631347656, "logps/rejected": -46.68934631347656, "loss": 0.7491, "rewards/accuracies": 0.0, "rewards/chosen": 5.863580226898193, "rewards/margins": 0.0, "rewards/rejected": 5.863580226898193, "step": 2604 }, { "epoch": 0.58, "learning_rate": 8.351066109645248e-06, "logits/chosen": -1.2689961194992065, "logits/rejected": -1.271150827407837, "logps/chosen": -58.564964294433594, "logps/rejected": -69.66828155517578, "loss": 2.0799, "rewards/accuracies": 0.0, "rewards/chosen": 2.7386138439178467, "rewards/margins": -1.5922324657440186, "rewards/rejected": 4.330846309661865, "step": 2605 }, { "epoch": 0.58, "learning_rate": 8.349735684744385e-06, "logits/chosen": -1.4246104955673218, "logits/rejected": -1.3633904457092285, "logps/chosen": -121.14584350585938, "logps/rejected": -68.27810668945312, "loss": 0.4835, "rewards/accuracies": 1.0, "rewards/chosen": 3.812761068344116, "rewards/margins": 0.1596832275390625, "rewards/rejected": 3.6530778408050537, "step": 2606 }, { "epoch": 0.58, "learning_rate": 8.34840482941054e-06, "logits/chosen": -1.660334825515747, "logits/rejected": -1.68003511428833, "logps/chosen": -64.87843322753906, "logps/rejected": -115.03007507324219, "loss": 1.8784, "rewards/accuracies": 0.0, "rewards/chosen": 3.1711807250976562, "rewards/margins": -3.6510558128356934, "rewards/rejected": 6.82223653793335, "step": 2607 }, { "epoch": 0.58, "learning_rate": 8.347073543814723e-06, "logits/chosen": -1.5217363834381104, "logits/rejected": -1.4957313537597656, "logps/chosen": -62.129886627197266, "logps/rejected": -108.69107818603516, "loss": 4.0461, "rewards/accuracies": 0.0, "rewards/chosen": 1.6189892292022705, "rewards/margins": -8.018129348754883, "rewards/rejected": 9.637118339538574, "step": 2608 }, { "epoch": 0.58, "learning_rate": 8.345741828128003e-06, "logits/chosen": -1.6494687795639038, "logits/rejected": -1.5403165817260742, "logps/chosen": -89.01258850097656, "logps/rejected": -39.60140609741211, "loss": 0.3625, "rewards/accuracies": 0.0, "rewards/chosen": 2.471860647201538, "rewards/margins": -0.008860349655151367, "rewards/rejected": 2.4807209968566895, "step": 2609 }, { "epoch": 0.58, "learning_rate": 8.344409682521499e-06, "logits/chosen": -1.4160012006759644, "logits/rejected": -1.4160012006759644, "logps/chosen": -28.490814208984375, "logps/rejected": -28.490814208984375, "loss": 0.3491, "rewards/accuracies": 0.0, "rewards/chosen": 2.8831818103790283, "rewards/margins": 0.0, "rewards/rejected": 2.8831818103790283, "step": 2610 }, { "epoch": 0.58, "learning_rate": 8.343077107166394e-06, "logits/chosen": -1.4912580251693726, "logits/rejected": -1.4612668752670288, "logps/chosen": -34.723236083984375, "logps/rejected": -50.636451721191406, "loss": 0.6078, "rewards/accuracies": 0.0, "rewards/chosen": 2.0484695434570312, "rewards/margins": -0.5408515930175781, "rewards/rejected": 2.5893211364746094, "step": 2611 }, { "epoch": 0.58, "learning_rate": 8.341744102233916e-06, "logits/chosen": -1.6816260814666748, "logits/rejected": -1.6062582731246948, "logps/chosen": -64.28138732910156, "logps/rejected": -30.981876373291016, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": 6.381108283996582, "rewards/margins": 3.77282452583313, "rewards/rejected": 2.608283758163452, "step": 2612 }, { "epoch": 0.58, "learning_rate": 8.340410667895352e-06, "logits/chosen": -1.7936948537826538, "logits/rejected": -1.8165029287338257, "logps/chosen": -43.50618362426758, "logps/rejected": -57.90676498413086, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 4.224294662475586, "rewards/margins": 0.27656006813049316, "rewards/rejected": 3.9477345943450928, "step": 2613 }, { "epoch": 0.58, "learning_rate": 8.339076804322048e-06, "logits/chosen": -2.045452117919922, "logits/rejected": -1.99805748462677, "logps/chosen": -101.38500213623047, "logps/rejected": -81.08978271484375, "loss": 1.0211, "rewards/accuracies": 1.0, "rewards/chosen": 4.3701395988464355, "rewards/margins": 1.903822660446167, "rewards/rejected": 2.4663169384002686, "step": 2614 }, { "epoch": 0.58, "learning_rate": 8.337742511685403e-06, "logits/chosen": -1.6378607749938965, "logits/rejected": -1.5353370904922485, "logps/chosen": -115.19552612304688, "logps/rejected": -56.491859436035156, "loss": 1.6446, "rewards/accuracies": 1.0, "rewards/chosen": 6.596543788909912, "rewards/margins": 2.9384191036224365, "rewards/rejected": 3.6581246852874756, "step": 2615 }, { "epoch": 0.58, "learning_rate": 8.336407790156868e-06, "logits/chosen": -1.3076504468917847, "logits/rejected": -1.2694255113601685, "logps/chosen": -56.672996520996094, "logps/rejected": -60.255592346191406, "loss": 0.3241, "rewards/accuracies": 1.0, "rewards/chosen": 2.5687179565429688, "rewards/margins": 0.09367895126342773, "rewards/rejected": 2.475039005279541, "step": 2616 }, { "epoch": 0.58, "learning_rate": 8.335072639907953e-06, "logits/chosen": -1.656678318977356, "logits/rejected": -1.6712723970413208, "logps/chosen": -64.47135925292969, "logps/rejected": -87.12036895751953, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 3.081366777420044, "rewards/margins": 1.2323006391525269, "rewards/rejected": 1.849066138267517, "step": 2617 }, { "epoch": 0.58, "learning_rate": 8.33373706111022e-06, "logits/chosen": -1.6375378370285034, "logits/rejected": -1.6111305952072144, "logps/chosen": -84.33204650878906, "logps/rejected": -82.27581787109375, "loss": 0.626, "rewards/accuracies": 0.0, "rewards/chosen": 3.9444854259490967, "rewards/margins": -0.11176371574401855, "rewards/rejected": 4.056249141693115, "step": 2618 }, { "epoch": 0.58, "learning_rate": 8.332401053935288e-06, "logits/chosen": -1.5189672708511353, "logits/rejected": -1.5245290994644165, "logps/chosen": -43.31324005126953, "logps/rejected": -80.03727722167969, "loss": 0.4608, "rewards/accuracies": 0.0, "rewards/chosen": 2.650137424468994, "rewards/margins": -0.382659912109375, "rewards/rejected": 3.032797336578369, "step": 2619 }, { "epoch": 0.58, "learning_rate": 8.331064618554834e-06, "logits/chosen": -1.8220510482788086, "logits/rejected": -1.6349871158599854, "logps/chosen": -93.34678649902344, "logps/rejected": -28.447126388549805, "loss": 0.3703, "rewards/accuracies": 1.0, "rewards/chosen": 1.7705177068710327, "rewards/margins": 0.5877248048782349, "rewards/rejected": 1.1827929019927979, "step": 2620 }, { "epoch": 0.58, "learning_rate": 8.329727755140584e-06, "logits/chosen": -1.7667770385742188, "logits/rejected": -1.7249419689178467, "logps/chosen": -69.92729949951172, "logps/rejected": -52.397483825683594, "loss": 0.1163, "rewards/accuracies": 1.0, "rewards/chosen": 4.249774932861328, "rewards/margins": 1.602207899093628, "rewards/rejected": 2.6475670337677, "step": 2621 }, { "epoch": 0.58, "learning_rate": 8.32839046386432e-06, "logits/chosen": -1.7564753293991089, "logits/rejected": -1.783334493637085, "logps/chosen": -189.59434509277344, "logps/rejected": -192.1083526611328, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 7.134947299957275, "rewards/margins": 0.02315664291381836, "rewards/rejected": 7.111790657043457, "step": 2622 }, { "epoch": 0.58, "learning_rate": 8.327052744897883e-06, "logits/chosen": -1.6676521301269531, "logits/rejected": -1.665914535522461, "logps/chosen": -41.802391052246094, "logps/rejected": -79.7294921875, "loss": 1.077, "rewards/accuracies": 0.0, "rewards/chosen": 1.959485650062561, "rewards/margins": -2.023374080657959, "rewards/rejected": 3.9828598499298096, "step": 2623 }, { "epoch": 0.58, "learning_rate": 8.325714598413169e-06, "logits/chosen": -1.6566946506500244, "logits/rejected": -1.6326030492782593, "logps/chosen": -50.406883239746094, "logps/rejected": -52.474632263183594, "loss": 1.1123, "rewards/accuracies": 0.0, "rewards/chosen": 1.2150986194610596, "rewards/margins": -0.7435798645019531, "rewards/rejected": 1.9586784839630127, "step": 2624 }, { "epoch": 0.58, "learning_rate": 8.32437602458212e-06, "logits/chosen": -1.4818823337554932, "logits/rejected": -1.3637850284576416, "logps/chosen": -93.82002258300781, "logps/rejected": -59.724754333496094, "loss": 0.3088, "rewards/accuracies": 1.0, "rewards/chosen": 5.374791145324707, "rewards/margins": 0.16251468658447266, "rewards/rejected": 5.212276458740234, "step": 2625 }, { "epoch": 0.58, "learning_rate": 8.323037023576745e-06, "logits/chosen": -1.4512792825698853, "logits/rejected": -1.4512792825698853, "logps/chosen": -18.446313858032227, "logps/rejected": -18.446313858032227, "loss": 1.215, "rewards/accuracies": 0.0, "rewards/chosen": 2.259885549545288, "rewards/margins": 0.0, "rewards/rejected": 2.259885549545288, "step": 2626 }, { "epoch": 0.58, "learning_rate": 8.3216975955691e-06, "logits/chosen": -1.5431890487670898, "logits/rejected": -1.4232820272445679, "logps/chosen": -71.0102310180664, "logps/rejected": -25.4550724029541, "loss": 0.1694, "rewards/accuracies": 1.0, "rewards/chosen": 1.8627541065216064, "rewards/margins": 1.0159456729888916, "rewards/rejected": 0.8468084335327148, "step": 2627 }, { "epoch": 0.58, "learning_rate": 8.320357740731302e-06, "logits/chosen": -1.7940200567245483, "logits/rejected": -1.7660750150680542, "logps/chosen": -125.38445281982422, "logps/rejected": -53.5262451171875, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 8.093137741088867, "rewards/margins": 3.8910136222839355, "rewards/rejected": 4.202124118804932, "step": 2628 }, { "epoch": 0.58, "learning_rate": 8.319017459235515e-06, "logits/chosen": -1.398862600326538, "logits/rejected": -1.4556113481521606, "logps/chosen": -32.400360107421875, "logps/rejected": -66.18122863769531, "loss": 0.7417, "rewards/accuracies": 0.0, "rewards/chosen": 2.3145949840545654, "rewards/margins": -1.0841012001037598, "rewards/rejected": 3.398696184158325, "step": 2629 }, { "epoch": 0.58, "learning_rate": 8.317676751253961e-06, "logits/chosen": -1.167176604270935, "logits/rejected": -1.1930607557296753, "logps/chosen": -48.93415451049805, "logps/rejected": -43.67611312866211, "loss": 0.477, "rewards/accuracies": 0.0, "rewards/chosen": 1.824679970741272, "rewards/margins": -0.26621854305267334, "rewards/rejected": 2.0908985137939453, "step": 2630 }, { "epoch": 0.58, "learning_rate": 8.316335616958922e-06, "logits/chosen": -1.7980329990386963, "logits/rejected": -1.31058669090271, "logps/chosen": -51.031822204589844, "logps/rejected": -72.5223388671875, "loss": 0.2515, "rewards/accuracies": 1.0, "rewards/chosen": 2.8124711513519287, "rewards/margins": 0.5835831165313721, "rewards/rejected": 2.2288880348205566, "step": 2631 }, { "epoch": 0.58, "learning_rate": 8.314994056522727e-06, "logits/chosen": -1.3782695531845093, "logits/rejected": -1.441209316253662, "logps/chosen": -20.093486785888672, "logps/rejected": -56.61084747314453, "loss": 1.4276, "rewards/accuracies": 0.0, "rewards/chosen": 0.7176223993301392, "rewards/margins": -2.5201616287231445, "rewards/rejected": 3.237783908843994, "step": 2632 }, { "epoch": 0.58, "learning_rate": 8.313652070117765e-06, "logits/chosen": -1.7415671348571777, "logits/rejected": -1.6818594932556152, "logps/chosen": -44.20185089111328, "logps/rejected": -49.49147033691406, "loss": 0.5876, "rewards/accuracies": 0.0, "rewards/chosen": 2.550518035888672, "rewards/margins": -0.036159515380859375, "rewards/rejected": 2.5866775512695312, "step": 2633 }, { "epoch": 0.58, "learning_rate": 8.31230965791648e-06, "logits/chosen": -1.601459264755249, "logits/rejected": -1.5713940858840942, "logps/chosen": -48.78254699707031, "logps/rejected": -48.19953155517578, "loss": 0.1468, "rewards/accuracies": 1.0, "rewards/chosen": 3.5679123401641846, "rewards/margins": 1.1139228343963623, "rewards/rejected": 2.4539895057678223, "step": 2634 }, { "epoch": 0.58, "learning_rate": 8.310966820091364e-06, "logits/chosen": -1.8431072235107422, "logits/rejected": -1.840440034866333, "logps/chosen": -77.16082000732422, "logps/rejected": -53.51411437988281, "loss": 0.8082, "rewards/accuracies": 0.0, "rewards/chosen": 2.8662421703338623, "rewards/margins": -1.3723971843719482, "rewards/rejected": 4.2386393547058105, "step": 2635 }, { "epoch": 0.58, "learning_rate": 8.309623556814972e-06, "logits/chosen": -1.341841220855713, "logits/rejected": -1.4164354801177979, "logps/chosen": -42.459617614746094, "logps/rejected": -77.89785766601562, "loss": 1.7283, "rewards/accuracies": 0.0, "rewards/chosen": 1.7457062005996704, "rewards/margins": -3.4242310523986816, "rewards/rejected": 5.1699371337890625, "step": 2636 }, { "epoch": 0.58, "learning_rate": 8.30827986825991e-06, "logits/chosen": -1.406339168548584, "logits/rejected": -1.470456838607788, "logps/chosen": -64.63249206542969, "logps/rejected": -101.198974609375, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 3.2525222301483154, "rewards/margins": 3.928508758544922, "rewards/rejected": -0.6759864687919617, "step": 2637 }, { "epoch": 0.58, "learning_rate": 8.306935754598838e-06, "logits/chosen": -1.541727900505066, "logits/rejected": -1.4699854850769043, "logps/chosen": -90.84181213378906, "logps/rejected": -79.923828125, "loss": 0.7869, "rewards/accuracies": 1.0, "rewards/chosen": 6.524785041809082, "rewards/margins": 4.076761245727539, "rewards/rejected": 2.448024034500122, "step": 2638 }, { "epoch": 0.58, "learning_rate": 8.305591216004468e-06, "logits/chosen": -1.4312925338745117, "logits/rejected": -1.3996890783309937, "logps/chosen": -53.35462951660156, "logps/rejected": -48.14384460449219, "loss": 1.5159, "rewards/accuracies": 0.0, "rewards/chosen": 1.890496850013733, "rewards/margins": -1.7677870988845825, "rewards/rejected": 3.6582839488983154, "step": 2639 }, { "epoch": 0.58, "learning_rate": 8.304246252649574e-06, "logits/chosen": -1.5185530185699463, "logits/rejected": -1.3663095235824585, "logps/chosen": -125.95130157470703, "logps/rejected": -66.61946868896484, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 6.723296642303467, "rewards/margins": 4.280086517333984, "rewards/rejected": 2.4432098865509033, "step": 2640 }, { "epoch": 0.58, "learning_rate": 8.302900864706982e-06, "logits/chosen": -1.6766659021377563, "logits/rejected": -1.6403634548187256, "logps/chosen": -52.426063537597656, "logps/rejected": -55.744083404541016, "loss": 0.6691, "rewards/accuracies": 0.0, "rewards/chosen": 2.696596622467041, "rewards/margins": -0.935652494430542, "rewards/rejected": 3.632249116897583, "step": 2641 }, { "epoch": 0.58, "learning_rate": 8.301555052349567e-06, "logits/chosen": -1.3433619737625122, "logits/rejected": -1.3673337697982788, "logps/chosen": -66.84417724609375, "logps/rejected": -62.98611831665039, "loss": 1.7101, "rewards/accuracies": 0.0, "rewards/chosen": 2.39725661277771, "rewards/margins": -3.150787591934204, "rewards/rejected": 5.548044204711914, "step": 2642 }, { "epoch": 0.58, "learning_rate": 8.300208815750266e-06, "logits/chosen": -1.5580313205718994, "logits/rejected": -1.5400910377502441, "logps/chosen": -142.0094451904297, "logps/rejected": -127.53767395019531, "loss": 0.3013, "rewards/accuracies": 1.0, "rewards/chosen": 5.641831874847412, "rewards/margins": 0.21469402313232422, "rewards/rejected": 5.427137851715088, "step": 2643 }, { "epoch": 0.59, "learning_rate": 8.298862155082065e-06, "logits/chosen": -1.4029088020324707, "logits/rejected": -1.3188409805297852, "logps/chosen": -72.01510620117188, "logps/rejected": -38.04375076293945, "loss": 0.4022, "rewards/accuracies": 1.0, "rewards/chosen": 2.9856088161468506, "rewards/margins": 0.09902477264404297, "rewards/rejected": 2.8865840435028076, "step": 2644 }, { "epoch": 0.59, "learning_rate": 8.297515070518008e-06, "logits/chosen": -1.315161108970642, "logits/rejected": -1.197974443435669, "logps/chosen": -58.721534729003906, "logps/rejected": -49.25691223144531, "loss": 0.0776, "rewards/accuracies": 1.0, "rewards/chosen": 2.877295732498169, "rewards/margins": 1.8640979528427124, "rewards/rejected": 1.0131977796554565, "step": 2645 }, { "epoch": 0.59, "learning_rate": 8.296167562231192e-06, "logits/chosen": -1.5598299503326416, "logits/rejected": -1.4437812566757202, "logps/chosen": -66.72444152832031, "logps/rejected": -34.842369079589844, "loss": 0.201, "rewards/accuracies": 1.0, "rewards/chosen": 3.597456455230713, "rewards/margins": 0.7075080871582031, "rewards/rejected": 2.8899483680725098, "step": 2646 }, { "epoch": 0.59, "learning_rate": 8.294819630394767e-06, "logits/chosen": -1.6222916841506958, "logits/rejected": -1.6082459688186646, "logps/chosen": -34.61363220214844, "logps/rejected": -56.956939697265625, "loss": 0.2992, "rewards/accuracies": 1.0, "rewards/chosen": 2.255854368209839, "rewards/margins": 0.20492815971374512, "rewards/rejected": 2.0509262084960938, "step": 2647 }, { "epoch": 0.59, "learning_rate": 8.293471275181938e-06, "logits/chosen": -1.1694449186325073, "logits/rejected": -1.1461657285690308, "logps/chosen": -37.60074234008789, "logps/rejected": -50.159461975097656, "loss": 0.7258, "rewards/accuracies": 1.0, "rewards/chosen": 3.485567808151245, "rewards/margins": 0.8724582195281982, "rewards/rejected": 2.613109588623047, "step": 2648 }, { "epoch": 0.59, "learning_rate": 8.292122496765969e-06, "logits/chosen": -1.3861724138259888, "logits/rejected": -1.3861724138259888, "logps/chosen": -51.43609619140625, "logps/rejected": -51.43609619140625, "loss": 0.4484, "rewards/accuracies": 0.0, "rewards/chosen": 5.695013523101807, "rewards/margins": 0.0, "rewards/rejected": 5.695013523101807, "step": 2649 }, { "epoch": 0.59, "learning_rate": 8.290773295320173e-06, "logits/chosen": -1.590512990951538, "logits/rejected": -1.5135581493377686, "logps/chosen": -162.3321990966797, "logps/rejected": -38.395992279052734, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 4.611058235168457, "rewards/margins": 2.747610569000244, "rewards/rejected": 1.8634475469589233, "step": 2650 }, { "epoch": 0.59, "learning_rate": 8.28942367101792e-06, "logits/chosen": -1.483621597290039, "logits/rejected": -1.4896503686904907, "logps/chosen": -80.27888488769531, "logps/rejected": -83.13988494873047, "loss": 0.8831, "rewards/accuracies": 1.0, "rewards/chosen": 5.058660984039307, "rewards/margins": 1.0381455421447754, "rewards/rejected": 4.020515441894531, "step": 2651 }, { "epoch": 0.59, "learning_rate": 8.288073624032634e-06, "logits/chosen": -1.865025281906128, "logits/rejected": -1.8036187887191772, "logps/chosen": -121.50248718261719, "logps/rejected": -73.6964111328125, "loss": 0.2855, "rewards/accuracies": 1.0, "rewards/chosen": 6.572096347808838, "rewards/margins": 2.8943519592285156, "rewards/rejected": 3.6777443885803223, "step": 2652 }, { "epoch": 0.59, "learning_rate": 8.28672315453779e-06, "logits/chosen": -1.3490465879440308, "logits/rejected": -1.3754229545593262, "logps/chosen": -29.546585083007812, "logps/rejected": -60.987403869628906, "loss": 0.5601, "rewards/accuracies": 0.0, "rewards/chosen": 2.1164422035217285, "rewards/margins": -0.7173385620117188, "rewards/rejected": 2.8337807655334473, "step": 2653 }, { "epoch": 0.59, "learning_rate": 8.285372262706922e-06, "logits/chosen": -1.4991761445999146, "logits/rejected": -1.5580047369003296, "logps/chosen": -25.640750885009766, "logps/rejected": -61.985595703125, "loss": 1.4064, "rewards/accuracies": 0.0, "rewards/chosen": 2.1222267150878906, "rewards/margins": -1.9917654991149902, "rewards/rejected": 4.113992214202881, "step": 2654 }, { "epoch": 0.59, "learning_rate": 8.284020948713615e-06, "logits/chosen": -1.4733202457427979, "logits/rejected": -1.5407315492630005, "logps/chosen": -26.390567779541016, "logps/rejected": -70.41358947753906, "loss": 0.6052, "rewards/accuracies": 1.0, "rewards/chosen": 4.2476935386657715, "rewards/margins": 1.339388132095337, "rewards/rejected": 2.9083054065704346, "step": 2655 }, { "epoch": 0.59, "learning_rate": 8.282669212731511e-06, "logits/chosen": -1.6249513626098633, "logits/rejected": -1.6024441719055176, "logps/chosen": -78.54020690917969, "logps/rejected": -54.80992889404297, "loss": 0.5481, "rewards/accuracies": 0.0, "rewards/chosen": 2.0679054260253906, "rewards/margins": -0.6741242408752441, "rewards/rejected": 2.7420296669006348, "step": 2656 }, { "epoch": 0.59, "learning_rate": 8.281317054934306e-06, "logits/chosen": -1.3913546800613403, "logits/rejected": -1.3913546800613403, "logps/chosen": -87.2223892211914, "logps/rejected": -87.2223892211914, "loss": 0.7401, "rewards/accuracies": 0.0, "rewards/chosen": 4.0271220207214355, "rewards/margins": 0.0, "rewards/rejected": 4.0271220207214355, "step": 2657 }, { "epoch": 0.59, "learning_rate": 8.279964475495745e-06, "logits/chosen": -1.5549449920654297, "logits/rejected": -1.24844229221344, "logps/chosen": -49.337501525878906, "logps/rejected": -88.5431900024414, "loss": 0.4741, "rewards/accuracies": 0.0, "rewards/chosen": 5.883088111877441, "rewards/margins": -0.14546966552734375, "rewards/rejected": 6.028557777404785, "step": 2658 }, { "epoch": 0.59, "learning_rate": 8.278611474589635e-06, "logits/chosen": -1.6944869756698608, "logits/rejected": -1.7587695121765137, "logps/chosen": -45.380401611328125, "logps/rejected": -102.19525909423828, "loss": 2.3276, "rewards/accuracies": 0.0, "rewards/chosen": 0.9339996576309204, "rewards/margins": -3.476320743560791, "rewards/rejected": 4.410320281982422, "step": 2659 }, { "epoch": 0.59, "learning_rate": 8.277258052389834e-06, "logits/chosen": -1.5717501640319824, "logits/rejected": -1.5521143674850464, "logps/chosen": -73.61679077148438, "logps/rejected": -53.45924377441406, "loss": 0.9227, "rewards/accuracies": 0.0, "rewards/chosen": 2.1576218605041504, "rewards/margins": -0.5645499229431152, "rewards/rejected": 2.7221717834472656, "step": 2660 }, { "epoch": 0.59, "learning_rate": 8.27590420907025e-06, "logits/chosen": -1.5008544921875, "logits/rejected": -1.588008165359497, "logps/chosen": -62.63652038574219, "logps/rejected": -145.73306274414062, "loss": 1.7991, "rewards/accuracies": 0.0, "rewards/chosen": 2.506786346435547, "rewards/margins": -3.41414737701416, "rewards/rejected": 5.920933723449707, "step": 2661 }, { "epoch": 0.59, "learning_rate": 8.27454994480485e-06, "logits/chosen": -1.5832406282424927, "logits/rejected": -1.5807510614395142, "logps/chosen": -38.81659698486328, "logps/rejected": -83.1667709350586, "loss": 1.5711, "rewards/accuracies": 0.0, "rewards/chosen": 3.1112403869628906, "rewards/margins": -0.2848389148712158, "rewards/rejected": 3.3960793018341064, "step": 2662 }, { "epoch": 0.59, "learning_rate": 8.273195259767653e-06, "logits/chosen": -1.7437982559204102, "logits/rejected": -1.6358174085617065, "logps/chosen": -144.11904907226562, "logps/rejected": -89.77114868164062, "loss": 0.8731, "rewards/accuracies": 1.0, "rewards/chosen": 6.569064617156982, "rewards/margins": 2.7641665935516357, "rewards/rejected": 3.8048980236053467, "step": 2663 }, { "epoch": 0.59, "learning_rate": 8.271840154132736e-06, "logits/chosen": -1.8595218658447266, "logits/rejected": -1.7590755224227905, "logps/chosen": -76.38484191894531, "logps/rejected": -117.00324249267578, "loss": 1.3794, "rewards/accuracies": 0.0, "rewards/chosen": 5.300396919250488, "rewards/margins": -2.6929893493652344, "rewards/rejected": 7.993386268615723, "step": 2664 }, { "epoch": 0.59, "learning_rate": 8.270484628074222e-06, "logits/chosen": -1.4546523094177246, "logits/rejected": -1.3646584749221802, "logps/chosen": -52.210487365722656, "logps/rejected": -47.684600830078125, "loss": 1.8788, "rewards/accuracies": 1.0, "rewards/chosen": 3.66816782951355, "rewards/margins": 2.3415937423706055, "rewards/rejected": 1.3265739679336548, "step": 2665 }, { "epoch": 0.59, "learning_rate": 8.269128681766296e-06, "logits/chosen": -1.394117832183838, "logits/rejected": -1.468336820602417, "logps/chosen": -138.892822265625, "logps/rejected": -112.09988403320312, "loss": 1.6568, "rewards/accuracies": 0.0, "rewards/chosen": 5.040208339691162, "rewards/margins": -0.9543886184692383, "rewards/rejected": 5.9945969581604, "step": 2666 }, { "epoch": 0.59, "learning_rate": 8.267772315383195e-06, "logits/chosen": -1.4966078996658325, "logits/rejected": -1.409369945526123, "logps/chosen": -70.53815460205078, "logps/rejected": -26.72867202758789, "loss": 3.7929, "rewards/accuracies": 0.0, "rewards/chosen": 2.7687065601348877, "rewards/margins": -2.0092031955718994, "rewards/rejected": 4.777909755706787, "step": 2667 }, { "epoch": 0.59, "learning_rate": 8.266415529099205e-06, "logits/chosen": -1.5042235851287842, "logits/rejected": -1.439066767692566, "logps/chosen": -65.38973999023438, "logps/rejected": -39.23013687133789, "loss": 1.8167, "rewards/accuracies": 1.0, "rewards/chosen": 5.885514259338379, "rewards/margins": 1.3480315208435059, "rewards/rejected": 4.537482738494873, "step": 2668 }, { "epoch": 0.59, "learning_rate": 8.265058323088673e-06, "logits/chosen": -1.556586503982544, "logits/rejected": -1.5011038780212402, "logps/chosen": -55.971893310546875, "logps/rejected": -41.589622497558594, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 5.105494976043701, "rewards/margins": 3.00905442237854, "rewards/rejected": 2.096440553665161, "step": 2669 }, { "epoch": 0.59, "learning_rate": 8.263700697525994e-06, "logits/chosen": -1.9318987131118774, "logits/rejected": -1.9392036199569702, "logps/chosen": -102.80694580078125, "logps/rejected": -88.14439392089844, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 7.783819675445557, "rewards/margins": 3.8455963134765625, "rewards/rejected": 3.938223361968994, "step": 2670 }, { "epoch": 0.59, "learning_rate": 8.262342652585621e-06, "logits/chosen": -1.3572864532470703, "logits/rejected": -1.368818998336792, "logps/chosen": -64.83177947998047, "logps/rejected": -69.373046875, "loss": 0.4595, "rewards/accuracies": 0.0, "rewards/chosen": 1.7556679248809814, "rewards/margins": -0.4087684154510498, "rewards/rejected": 2.1644363403320312, "step": 2671 }, { "epoch": 0.59, "learning_rate": 8.260984188442063e-06, "logits/chosen": -1.8512318134307861, "logits/rejected": -1.7684986591339111, "logps/chosen": -176.4471893310547, "logps/rejected": -115.298095703125, "loss": 1.0668, "rewards/accuracies": 0.0, "rewards/chosen": 6.5638747215271, "rewards/margins": -1.9959063529968262, "rewards/rejected": 8.559781074523926, "step": 2672 }, { "epoch": 0.59, "learning_rate": 8.259625305269873e-06, "logits/chosen": -1.4938968420028687, "logits/rejected": -1.4512594938278198, "logps/chosen": -40.227020263671875, "logps/rejected": -32.43949890136719, "loss": 0.3253, "rewards/accuracies": 1.0, "rewards/chosen": 2.576098680496216, "rewards/margins": 0.09549403190612793, "rewards/rejected": 2.480604648590088, "step": 2673 }, { "epoch": 0.59, "learning_rate": 8.258266003243667e-06, "logits/chosen": -1.567346453666687, "logits/rejected": -1.567346453666687, "logps/chosen": -48.258914947509766, "logps/rejected": -48.258914947509766, "loss": 0.8643, "rewards/accuracies": 0.0, "rewards/chosen": 5.023301601409912, "rewards/margins": 0.0, "rewards/rejected": 5.023301601409912, "step": 2674 }, { "epoch": 0.59, "learning_rate": 8.256906282538113e-06, "logits/chosen": -1.520963191986084, "logits/rejected": -1.4585447311401367, "logps/chosen": -54.69964599609375, "logps/rejected": -58.81098175048828, "loss": 0.2419, "rewards/accuracies": 1.0, "rewards/chosen": 2.990926504135132, "rewards/margins": 1.2749565839767456, "rewards/rejected": 1.7159699201583862, "step": 2675 }, { "epoch": 0.59, "learning_rate": 8.25554614332793e-06, "logits/chosen": -1.4817285537719727, "logits/rejected": -1.1416648626327515, "logps/chosen": -51.53425598144531, "logps/rejected": -79.0908203125, "loss": 0.4507, "rewards/accuracies": 0.0, "rewards/chosen": 1.7003593444824219, "rewards/margins": -0.34078001976013184, "rewards/rejected": 2.0411393642425537, "step": 2676 }, { "epoch": 0.59, "learning_rate": 8.254185585787895e-06, "logits/chosen": -1.6520031690597534, "logits/rejected": -1.6163498163223267, "logps/chosen": -68.98880004882812, "logps/rejected": -48.62565612792969, "loss": 0.3914, "rewards/accuracies": 0.0, "rewards/chosen": 2.352678060531616, "rewards/margins": -0.16079473495483398, "rewards/rejected": 2.51347279548645, "step": 2677 }, { "epoch": 0.59, "learning_rate": 8.252824610092835e-06, "logits/chosen": -1.502378225326538, "logits/rejected": -1.486801266670227, "logps/chosen": -33.36513900756836, "logps/rejected": -24.962646484375, "loss": 0.415, "rewards/accuracies": 1.0, "rewards/chosen": 1.8024605512619019, "rewards/margins": 0.12214398384094238, "rewards/rejected": 1.6803165674209595, "step": 2678 }, { "epoch": 0.59, "learning_rate": 8.251463216417632e-06, "logits/chosen": -1.7988742589950562, "logits/rejected": -1.6646089553833008, "logps/chosen": -83.951416015625, "logps/rejected": -21.873294830322266, "loss": 0.3818, "rewards/accuracies": 1.0, "rewards/chosen": 6.284543037414551, "rewards/margins": 5.814297676086426, "rewards/rejected": 0.4702455699443817, "step": 2679 }, { "epoch": 0.59, "learning_rate": 8.250101404937223e-06, "logits/chosen": -1.5263471603393555, "logits/rejected": -1.4899530410766602, "logps/chosen": -63.13358688354492, "logps/rejected": -47.19538116455078, "loss": 0.9877, "rewards/accuracies": 0.0, "rewards/chosen": 3.13267183303833, "rewards/margins": -1.5276684761047363, "rewards/rejected": 4.660340309143066, "step": 2680 }, { "epoch": 0.59, "learning_rate": 8.248739175826594e-06, "logits/chosen": -1.6734583377838135, "logits/rejected": -1.6586261987686157, "logps/chosen": -131.73751831054688, "logps/rejected": -80.2318115234375, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": 5.612684726715088, "rewards/margins": 2.090045213699341, "rewards/rejected": 3.522639513015747, "step": 2681 }, { "epoch": 0.59, "learning_rate": 8.247376529260793e-06, "logits/chosen": -1.5873996019363403, "logits/rejected": -1.2328121662139893, "logps/chosen": -79.14579010009766, "logps/rejected": -57.11613845825195, "loss": 0.1785, "rewards/accuracies": 1.0, "rewards/chosen": 2.8734452724456787, "rewards/margins": 0.9136998653411865, "rewards/rejected": 1.9597454071044922, "step": 2682 }, { "epoch": 0.59, "learning_rate": 8.246013465414914e-06, "logits/chosen": -1.4249614477157593, "logits/rejected": -1.3376086950302124, "logps/chosen": -25.674686431884766, "logps/rejected": -4.17948579788208, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 4.167909145355225, "rewards/margins": 3.278327703475952, "rewards/rejected": 0.8895815014839172, "step": 2683 }, { "epoch": 0.59, "learning_rate": 8.244649984464109e-06, "logits/chosen": -1.4032751321792603, "logits/rejected": -1.3621482849121094, "logps/chosen": -93.84918212890625, "logps/rejected": -55.2818603515625, "loss": 0.402, "rewards/accuracies": 0.0, "rewards/chosen": 5.027421474456787, "rewards/margins": -0.20059680938720703, "rewards/rejected": 5.228018283843994, "step": 2684 }, { "epoch": 0.59, "learning_rate": 8.243286086583577e-06, "logits/chosen": -1.5510549545288086, "logits/rejected": -1.575697898864746, "logps/chosen": -37.600486755371094, "logps/rejected": -64.22378540039062, "loss": 2.9028, "rewards/accuracies": 0.0, "rewards/chosen": 2.3448379039764404, "rewards/margins": -0.8044755458831787, "rewards/rejected": 3.149313449859619, "step": 2685 }, { "epoch": 0.59, "learning_rate": 8.241921771948583e-06, "logits/chosen": -1.3874285221099854, "logits/rejected": -1.3691517114639282, "logps/chosen": -33.45682907104492, "logps/rejected": -60.70258331298828, "loss": 1.7946, "rewards/accuracies": 0.0, "rewards/chosen": 2.2633137702941895, "rewards/margins": -1.5599567890167236, "rewards/rejected": 3.823270559310913, "step": 2686 }, { "epoch": 0.59, "learning_rate": 8.240557040734434e-06, "logits/chosen": -1.9317171573638916, "logits/rejected": -1.96486496925354, "logps/chosen": -117.94583892822266, "logps/rejected": -123.25430297851562, "loss": 1.5992, "rewards/accuracies": 0.0, "rewards/chosen": 6.73424768447876, "rewards/margins": -2.9535489082336426, "rewards/rejected": 9.687796592712402, "step": 2687 }, { "epoch": 0.59, "learning_rate": 8.239191893116494e-06, "logits/chosen": -1.4069955348968506, "logits/rejected": -1.4537023305892944, "logps/chosen": -57.88382339477539, "logps/rejected": -73.43987274169922, "loss": 3.0194, "rewards/accuracies": 0.0, "rewards/chosen": 1.6797802448272705, "rewards/margins": -6.011675834655762, "rewards/rejected": 7.691455841064453, "step": 2688 }, { "epoch": 0.6, "learning_rate": 8.237826329270183e-06, "logits/chosen": -1.863957405090332, "logits/rejected": -1.8457540273666382, "logps/chosen": -46.93341064453125, "logps/rejected": -60.834999084472656, "loss": 0.4319, "rewards/accuracies": 0.0, "rewards/chosen": 2.4361557960510254, "rewards/margins": -0.30278921127319336, "rewards/rejected": 2.7389450073242188, "step": 2689 }, { "epoch": 0.6, "learning_rate": 8.236460349370972e-06, "logits/chosen": -1.6573930978775024, "logits/rejected": -1.6439740657806396, "logps/chosen": -49.338134765625, "logps/rejected": -85.96236419677734, "loss": 0.4647, "rewards/accuracies": 0.0, "rewards/chosen": 3.1456305980682373, "rewards/margins": -0.38234567642211914, "rewards/rejected": 3.5279762744903564, "step": 2690 }, { "epoch": 0.6, "learning_rate": 8.235093953594387e-06, "logits/chosen": -1.5989704132080078, "logits/rejected": -1.5452250242233276, "logps/chosen": -85.0789794921875, "logps/rejected": -40.302734375, "loss": 0.3333, "rewards/accuracies": 1.0, "rewards/chosen": 8.108113288879395, "rewards/margins": 4.174679756164551, "rewards/rejected": 3.9334335327148438, "step": 2691 }, { "epoch": 0.6, "learning_rate": 8.233727142116007e-06, "logits/chosen": -1.5522197484970093, "logits/rejected": -1.5459038019180298, "logps/chosen": -44.25288009643555, "logps/rejected": -63.40350341796875, "loss": 1.3762, "rewards/accuracies": 1.0, "rewards/chosen": 4.587433338165283, "rewards/margins": 2.102787494659424, "rewards/rejected": 2.4846458435058594, "step": 2692 }, { "epoch": 0.6, "learning_rate": 8.232359915111462e-06, "logits/chosen": -1.6896039247512817, "logits/rejected": -1.6608351469039917, "logps/chosen": -93.83076477050781, "logps/rejected": -105.75892639160156, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 8.870757102966309, "rewards/margins": 2.745361804962158, "rewards/rejected": 6.12539529800415, "step": 2693 }, { "epoch": 0.6, "learning_rate": 8.230992272756438e-06, "logits/chosen": -1.5837467908859253, "logits/rejected": -1.5634684562683105, "logps/chosen": -60.6511116027832, "logps/rejected": -58.13979721069336, "loss": 1.3452, "rewards/accuracies": 0.0, "rewards/chosen": 3.471238374710083, "rewards/margins": -1.212395429611206, "rewards/rejected": 4.683633804321289, "step": 2694 }, { "epoch": 0.6, "learning_rate": 8.229624215226675e-06, "logits/chosen": -1.9172409772872925, "logits/rejected": -1.9019619226455688, "logps/chosen": -85.7469253540039, "logps/rejected": -132.8540802001953, "loss": 3.501, "rewards/accuracies": 0.0, "rewards/chosen": 2.9326226711273193, "rewards/margins": -5.73661994934082, "rewards/rejected": 8.669242858886719, "step": 2695 }, { "epoch": 0.6, "learning_rate": 8.228255742697962e-06, "logits/chosen": -1.375288724899292, "logits/rejected": -1.3442223072052002, "logps/chosen": -63.170589447021484, "logps/rejected": -105.75224304199219, "loss": 0.6489, "rewards/accuracies": 0.0, "rewards/chosen": 5.6403021812438965, "rewards/margins": -0.10503625869750977, "rewards/rejected": 5.745338439941406, "step": 2696 }, { "epoch": 0.6, "learning_rate": 8.226886855346148e-06, "logits/chosen": -1.3970087766647339, "logits/rejected": -1.3587485551834106, "logps/chosen": -59.63069152832031, "logps/rejected": -94.64236450195312, "loss": 1.2998, "rewards/accuracies": 0.0, "rewards/chosen": 2.5470383167266846, "rewards/margins": -2.271080255508423, "rewards/rejected": 4.818118572235107, "step": 2697 }, { "epoch": 0.6, "learning_rate": 8.225517553347132e-06, "logits/chosen": -1.60447359085083, "logits/rejected": -1.5703545808792114, "logps/chosen": -77.88934326171875, "logps/rejected": -68.49093627929688, "loss": 0.3414, "rewards/accuracies": 1.0, "rewards/chosen": 3.828456163406372, "rewards/margins": 0.6120650768280029, "rewards/rejected": 3.216391086578369, "step": 2698 }, { "epoch": 0.6, "learning_rate": 8.224147836876861e-06, "logits/chosen": -1.5435079336166382, "logits/rejected": -1.502278447151184, "logps/chosen": -62.009002685546875, "logps/rejected": -103.78744506835938, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 5.048547267913818, "rewards/margins": 1.3256926536560059, "rewards/rejected": 3.7228546142578125, "step": 2699 }, { "epoch": 0.6, "learning_rate": 8.222777706111345e-06, "logits/chosen": -1.7633641958236694, "logits/rejected": -1.7244688272476196, "logps/chosen": -87.93449401855469, "logps/rejected": -35.66353988647461, "loss": 0.1893, "rewards/accuracies": 1.0, "rewards/chosen": 4.774348735809326, "rewards/margins": 1.0032966136932373, "rewards/rejected": 3.771052122116089, "step": 2700 }, { "epoch": 0.6, "learning_rate": 8.221407161226641e-06, "logits/chosen": -1.3594543933868408, "logits/rejected": -1.3594543933868408, "logps/chosen": -48.89370346069336, "logps/rejected": -48.89370346069336, "loss": 0.3488, "rewards/accuracies": 0.0, "rewards/chosen": 5.495523452758789, "rewards/margins": 0.0, "rewards/rejected": 5.495523452758789, "step": 2701 }, { "epoch": 0.6, "learning_rate": 8.220036202398861e-06, "logits/chosen": -1.8131110668182373, "logits/rejected": -1.7958446741104126, "logps/chosen": -92.062744140625, "logps/rejected": -88.21322631835938, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": 6.442936897277832, "rewards/margins": 0.5988879203796387, "rewards/rejected": 5.844048976898193, "step": 2702 }, { "epoch": 0.6, "learning_rate": 8.21866482980417e-06, "logits/chosen": -1.6007845401763916, "logits/rejected": -1.5439954996109009, "logps/chosen": -90.45890808105469, "logps/rejected": -62.59764862060547, "loss": 1.379, "rewards/accuracies": 1.0, "rewards/chosen": 4.569053649902344, "rewards/margins": 1.6393425464630127, "rewards/rejected": 2.929711103439331, "step": 2703 }, { "epoch": 0.6, "learning_rate": 8.217293043618786e-06, "logits/chosen": -1.7967921495437622, "logits/rejected": -1.7743806838989258, "logps/chosen": -52.16902160644531, "logps/rejected": -55.097164154052734, "loss": 1.2038, "rewards/accuracies": 1.0, "rewards/chosen": 7.160122871398926, "rewards/margins": 2.735964775085449, "rewards/rejected": 4.424158096313477, "step": 2704 }, { "epoch": 0.6, "learning_rate": 8.21592084401898e-06, "logits/chosen": -1.3081669807434082, "logits/rejected": -1.3081669807434082, "logps/chosen": -38.88335037231445, "logps/rejected": -38.88335037231445, "loss": 1.6615, "rewards/accuracies": 0.0, "rewards/chosen": 0.6409958004951477, "rewards/margins": 0.0, "rewards/rejected": 0.6409958004951477, "step": 2705 }, { "epoch": 0.6, "learning_rate": 8.214548231181077e-06, "logits/chosen": -1.6379402875900269, "logits/rejected": -1.6110824346542358, "logps/chosen": -52.857208251953125, "logps/rejected": -48.084468841552734, "loss": 0.6555, "rewards/accuracies": 1.0, "rewards/chosen": 2.428093671798706, "rewards/margins": 0.1569209098815918, "rewards/rejected": 2.2711727619171143, "step": 2706 }, { "epoch": 0.6, "learning_rate": 8.213175205281451e-06, "logits/chosen": -1.476812481880188, "logits/rejected": -1.3616546392440796, "logps/chosen": -92.09486389160156, "logps/rejected": -67.53673553466797, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 6.486939907073975, "rewards/margins": 2.863091230392456, "rewards/rejected": 3.6238486766815186, "step": 2707 }, { "epoch": 0.6, "learning_rate": 8.211801766496537e-06, "logits/chosen": -1.5963460206985474, "logits/rejected": -1.5862460136413574, "logps/chosen": -53.06591796875, "logps/rejected": -25.230939865112305, "loss": 0.6372, "rewards/accuracies": 1.0, "rewards/chosen": 3.431591749191284, "rewards/margins": 0.1979055404663086, "rewards/rejected": 3.2336862087249756, "step": 2708 }, { "epoch": 0.6, "learning_rate": 8.210427915002819e-06, "logits/chosen": -1.4343559741973877, "logits/rejected": -1.4343559741973877, "logps/chosen": -42.879486083984375, "logps/rejected": -42.879486083984375, "loss": 0.7377, "rewards/accuracies": 0.0, "rewards/chosen": 2.15019154548645, "rewards/margins": 0.0, "rewards/rejected": 2.15019154548645, "step": 2709 }, { "epoch": 0.6, "learning_rate": 8.20905365097683e-06, "logits/chosen": -1.7306594848632812, "logits/rejected": -1.683806300163269, "logps/chosen": -68.92378997802734, "logps/rejected": -62.99537658691406, "loss": 1.0963, "rewards/accuracies": 0.0, "rewards/chosen": 2.6335244178771973, "rewards/margins": -2.0740737915039062, "rewards/rejected": 4.7075982093811035, "step": 2710 }, { "epoch": 0.6, "learning_rate": 8.20767897459516e-06, "logits/chosen": -1.446484923362732, "logits/rejected": -1.4537917375564575, "logps/chosen": -72.08633422851562, "logps/rejected": -69.53858947753906, "loss": 0.4543, "rewards/accuracies": 0.0, "rewards/chosen": 3.0766189098358154, "rewards/margins": -0.3864471912384033, "rewards/rejected": 3.4630661010742188, "step": 2711 }, { "epoch": 0.6, "learning_rate": 8.206303886034455e-06, "logits/chosen": -1.543259620666504, "logits/rejected": -1.4868760108947754, "logps/chosen": -63.052146911621094, "logps/rejected": -44.622474670410156, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": 6.129186153411865, "rewards/margins": 2.126065731048584, "rewards/rejected": 4.003120422363281, "step": 2712 }, { "epoch": 0.6, "learning_rate": 8.204928385471406e-06, "logits/chosen": -1.5512653589248657, "logits/rejected": -1.5348602533340454, "logps/chosen": -34.646636962890625, "logps/rejected": -23.89083480834961, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 2.0361626148223877, "rewards/margins": -0.01597452163696289, "rewards/rejected": 2.0521371364593506, "step": 2713 }, { "epoch": 0.6, "learning_rate": 8.203552473082766e-06, "logits/chosen": -1.533408761024475, "logits/rejected": -1.4450815916061401, "logps/chosen": -128.70248413085938, "logps/rejected": -87.51270294189453, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 5.6282501220703125, "rewards/margins": 3.2126059532165527, "rewards/rejected": 2.4156441688537598, "step": 2714 }, { "epoch": 0.6, "learning_rate": 8.202176149045334e-06, "logits/chosen": -1.5870500802993774, "logits/rejected": -1.5583734512329102, "logps/chosen": -31.437332153320312, "logps/rejected": -50.006324768066406, "loss": 1.1822, "rewards/accuracies": 0.0, "rewards/chosen": 2.6159050464630127, "rewards/margins": -1.7380235195159912, "rewards/rejected": 4.353928565979004, "step": 2715 }, { "epoch": 0.6, "learning_rate": 8.200799413535962e-06, "logits/chosen": -1.7383934259414673, "logits/rejected": -1.4588807821273804, "logps/chosen": -173.70217895507812, "logps/rejected": -24.02117919921875, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 7.113922119140625, "rewards/margins": 6.762247085571289, "rewards/rejected": 0.35167503356933594, "step": 2716 }, { "epoch": 0.6, "learning_rate": 8.199422266731563e-06, "logits/chosen": -1.5764364004135132, "logits/rejected": -1.6903504133224487, "logps/chosen": -41.63605499267578, "logps/rejected": -92.38345336914062, "loss": 2.9658, "rewards/accuracies": 0.0, "rewards/chosen": 2.879209280014038, "rewards/margins": -4.773443222045898, "rewards/rejected": 7.652652263641357, "step": 2717 }, { "epoch": 0.6, "learning_rate": 8.198044708809094e-06, "logits/chosen": -1.5468569993972778, "logits/rejected": -1.5287165641784668, "logps/chosen": -147.80056762695312, "logps/rejected": -125.86465454101562, "loss": 0.4031, "rewards/accuracies": 1.0, "rewards/chosen": 8.745898246765137, "rewards/margins": 0.7793440818786621, "rewards/rejected": 7.966554164886475, "step": 2718 }, { "epoch": 0.6, "learning_rate": 8.196666739945566e-06, "logits/chosen": -1.4980887174606323, "logits/rejected": -1.5336846113204956, "logps/chosen": -76.06275939941406, "logps/rejected": -48.952335357666016, "loss": 1.9425, "rewards/accuracies": 0.0, "rewards/chosen": 1.71099853515625, "rewards/margins": -1.1110026836395264, "rewards/rejected": 2.8220012187957764, "step": 2719 }, { "epoch": 0.6, "learning_rate": 8.195288360318048e-06, "logits/chosen": -1.482010841369629, "logits/rejected": -1.4608018398284912, "logps/chosen": -114.78465270996094, "logps/rejected": -105.98919677734375, "loss": 2.6573, "rewards/accuracies": 0.0, "rewards/chosen": 7.7556471824646, "rewards/margins": -2.109245777130127, "rewards/rejected": 9.864892959594727, "step": 2720 }, { "epoch": 0.6, "learning_rate": 8.193909570103656e-06, "logits/chosen": -1.6026753187179565, "logits/rejected": -1.4021151065826416, "logps/chosen": -88.62205505371094, "logps/rejected": -62.647708892822266, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 6.898907661437988, "rewards/margins": 4.814774036407471, "rewards/rejected": 2.0841336250305176, "step": 2721 }, { "epoch": 0.6, "learning_rate": 8.192530369479562e-06, "logits/chosen": -1.4306840896606445, "logits/rejected": -1.3222591876983643, "logps/chosen": -110.65715789794922, "logps/rejected": -50.34315490722656, "loss": 0.1821, "rewards/accuracies": 1.0, "rewards/chosen": 5.746579647064209, "rewards/margins": 3.143488883972168, "rewards/rejected": 2.603090763092041, "step": 2722 }, { "epoch": 0.6, "learning_rate": 8.191150758622991e-06, "logits/chosen": -1.5827393531799316, "logits/rejected": -1.5861608982086182, "logps/chosen": -66.41258239746094, "logps/rejected": -69.03057861328125, "loss": 1.3018, "rewards/accuracies": 0.0, "rewards/chosen": 3.4355950355529785, "rewards/margins": -2.5185980796813965, "rewards/rejected": 5.954193115234375, "step": 2723 }, { "epoch": 0.6, "learning_rate": 8.189770737711218e-06, "logits/chosen": -1.4861356019973755, "logits/rejected": -1.4683767557144165, "logps/chosen": -50.847137451171875, "logps/rejected": -74.89317321777344, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 4.022810459136963, "rewards/margins": 2.1817665100097656, "rewards/rejected": 1.8410438299179077, "step": 2724 }, { "epoch": 0.6, "learning_rate": 8.188390306921574e-06, "logits/chosen": -1.6285455226898193, "logits/rejected": -1.5589262247085571, "logps/chosen": -142.261962890625, "logps/rejected": -106.25150299072266, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": 9.219781875610352, "rewards/margins": 5.696849822998047, "rewards/rejected": 3.5229318141937256, "step": 2725 }, { "epoch": 0.6, "learning_rate": 8.18700946643144e-06, "logits/chosen": -1.5395466089248657, "logits/rejected": -1.4036611318588257, "logps/chosen": -122.68050384521484, "logps/rejected": -63.190673828125, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 6.100203990936279, "rewards/margins": 3.8506076335906982, "rewards/rejected": 2.249596357345581, "step": 2726 }, { "epoch": 0.6, "learning_rate": 8.18562821641825e-06, "logits/chosen": -1.7672173976898193, "logits/rejected": -1.7650045156478882, "logps/chosen": -72.79185485839844, "logps/rejected": -77.17666625976562, "loss": 1.7308, "rewards/accuracies": 0.0, "rewards/chosen": 1.9966201782226562, "rewards/margins": -3.429154872894287, "rewards/rejected": 5.425775051116943, "step": 2727 }, { "epoch": 0.6, "learning_rate": 8.184246557059493e-06, "logits/chosen": -1.4339494705200195, "logits/rejected": -1.3268871307373047, "logps/chosen": -103.23757934570312, "logps/rejected": -56.175315856933594, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": 6.2297258377075195, "rewards/margins": 4.0727715492248535, "rewards/rejected": 2.156954288482666, "step": 2728 }, { "epoch": 0.6, "learning_rate": 8.182864488532707e-06, "logits/chosen": -1.7713775634765625, "logits/rejected": -1.7783517837524414, "logps/chosen": -97.22349548339844, "logps/rejected": -66.98808288574219, "loss": 0.2119, "rewards/accuracies": 1.0, "rewards/chosen": 5.89865255355835, "rewards/margins": 1.1324443817138672, "rewards/rejected": 4.766208171844482, "step": 2729 }, { "epoch": 0.6, "learning_rate": 8.181482011015488e-06, "logits/chosen": -1.5414671897888184, "logits/rejected": -1.4579013586044312, "logps/chosen": -85.04695892333984, "logps/rejected": -70.3958969116211, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": 3.820599317550659, "rewards/margins": 1.8621329069137573, "rewards/rejected": 1.9584664106369019, "step": 2730 }, { "epoch": 0.6, "learning_rate": 8.180099124685476e-06, "logits/chosen": -1.5474790334701538, "logits/rejected": -1.557141661643982, "logps/chosen": -67.61002349853516, "logps/rejected": -84.6275863647461, "loss": 0.8484, "rewards/accuracies": 0.0, "rewards/chosen": 2.340216875076294, "rewards/margins": -1.0151267051696777, "rewards/rejected": 3.3553435802459717, "step": 2731 }, { "epoch": 0.6, "learning_rate": 8.178715829720374e-06, "logits/chosen": -1.2582250833511353, "logits/rejected": -1.2582250833511353, "logps/chosen": -62.39357376098633, "logps/rejected": -62.39357376098633, "loss": 0.8422, "rewards/accuracies": 0.0, "rewards/chosen": 6.051773548126221, "rewards/margins": 0.0, "rewards/rejected": 6.051773548126221, "step": 2732 }, { "epoch": 0.6, "learning_rate": 8.177332126297928e-06, "logits/chosen": -1.5377373695373535, "logits/rejected": -1.2953412532806396, "logps/chosen": -181.56500244140625, "logps/rejected": -33.28827667236328, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 7.282525539398193, "rewards/margins": 7.66080904006958, "rewards/rejected": -0.3782833218574524, "step": 2733 }, { "epoch": 0.61, "learning_rate": 8.175948014595942e-06, "logits/chosen": -1.4899979829788208, "logits/rejected": -1.454799771308899, "logps/chosen": -72.59815979003906, "logps/rejected": -63.339080810546875, "loss": 1.4197, "rewards/accuracies": 0.0, "rewards/chosen": 2.7247283458709717, "rewards/margins": -1.9183404445648193, "rewards/rejected": 4.643068790435791, "step": 2734 }, { "epoch": 0.61, "learning_rate": 8.17456349479227e-06, "logits/chosen": -1.5487948656082153, "logits/rejected": -1.5761710405349731, "logps/chosen": -61.13427734375, "logps/rejected": -142.33506774902344, "loss": 0.6891, "rewards/accuracies": 0.0, "rewards/chosen": 3.048837423324585, "rewards/margins": -1.0823562145233154, "rewards/rejected": 4.1311936378479, "step": 2735 }, { "epoch": 0.61, "learning_rate": 8.17317856706482e-06, "logits/chosen": -1.7332658767700195, "logits/rejected": -1.7425968647003174, "logps/chosen": -14.259859085083008, "logps/rejected": -17.941608428955078, "loss": 1.0544, "rewards/accuracies": 0.0, "rewards/chosen": 0.5145296454429626, "rewards/margins": -1.1751070022583008, "rewards/rejected": 1.6896365880966187, "step": 2736 }, { "epoch": 0.61, "learning_rate": 8.171793231591553e-06, "logits/chosen": -1.3185774087905884, "logits/rejected": -1.1769119501113892, "logps/chosen": -57.224945068359375, "logps/rejected": -21.93307876586914, "loss": 0.2355, "rewards/accuracies": 1.0, "rewards/chosen": 3.7132568359375, "rewards/margins": 2.1729846000671387, "rewards/rejected": 1.5402721166610718, "step": 2737 }, { "epoch": 0.61, "learning_rate": 8.170407488550482e-06, "logits/chosen": -1.5871154069900513, "logits/rejected": -1.569204568862915, "logps/chosen": -67.74728393554688, "logps/rejected": -46.274200439453125, "loss": 0.9855, "rewards/accuracies": 0.0, "rewards/chosen": 2.2822976112365723, "rewards/margins": -0.46294236183166504, "rewards/rejected": 2.7452399730682373, "step": 2738 }, { "epoch": 0.61, "learning_rate": 8.169021338119669e-06, "logits/chosen": -1.353730320930481, "logits/rejected": -1.374518871307373, "logps/chosen": -35.830039978027344, "logps/rejected": -71.986328125, "loss": 2.5801, "rewards/accuracies": 0.0, "rewards/chosen": 2.648519992828369, "rewards/margins": -3.2151565551757812, "rewards/rejected": 5.86367654800415, "step": 2739 }, { "epoch": 0.61, "learning_rate": 8.167634780477231e-06, "logits/chosen": -1.5213016271591187, "logits/rejected": -1.53549325466156, "logps/chosen": -58.7636833190918, "logps/rejected": -53.13463592529297, "loss": 0.4557, "rewards/accuracies": 0.0, "rewards/chosen": 2.7325704097747803, "rewards/margins": -0.07428622245788574, "rewards/rejected": 2.806856632232666, "step": 2740 }, { "epoch": 0.61, "learning_rate": 8.16624781580134e-06, "logits/chosen": -1.4805107116699219, "logits/rejected": -1.4258748292922974, "logps/chosen": -57.02809143066406, "logps/rejected": -35.50132751464844, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 6.918731212615967, "rewards/margins": 3.5000956058502197, "rewards/rejected": 3.418635606765747, "step": 2741 }, { "epoch": 0.61, "learning_rate": 8.164860444270217e-06, "logits/chosen": -1.3296302556991577, "logits/rejected": -1.382096767425537, "logps/chosen": -188.66990661621094, "logps/rejected": -48.94925308227539, "loss": 0.1711, "rewards/accuracies": 1.0, "rewards/chosen": 7.210157871246338, "rewards/margins": 4.983762741088867, "rewards/rejected": 2.2263951301574707, "step": 2742 }, { "epoch": 0.61, "learning_rate": 8.163472666062133e-06, "logits/chosen": -1.63300621509552, "logits/rejected": -1.4307975769042969, "logps/chosen": -137.2464141845703, "logps/rejected": -45.19043731689453, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 6.8332977294921875, "rewards/margins": 3.1863982677459717, "rewards/rejected": 3.646899461746216, "step": 2743 }, { "epoch": 0.61, "learning_rate": 8.162084481355418e-06, "logits/chosen": -1.6130661964416504, "logits/rejected": -1.538392424583435, "logps/chosen": -181.22035217285156, "logps/rejected": -74.6519775390625, "loss": 0.4989, "rewards/accuracies": 1.0, "rewards/chosen": 4.033012390136719, "rewards/margins": 0.22426152229309082, "rewards/rejected": 3.808750867843628, "step": 2744 }, { "epoch": 0.61, "learning_rate": 8.160695890328448e-06, "logits/chosen": -1.2338476181030273, "logits/rejected": -1.2679834365844727, "logps/chosen": -103.96354675292969, "logps/rejected": -113.45777130126953, "loss": 1.6045, "rewards/accuracies": 0.0, "rewards/chosen": 4.100940227508545, "rewards/margins": -3.1675238609313965, "rewards/rejected": 7.268464088439941, "step": 2745 }, { "epoch": 0.61, "learning_rate": 8.159306893159652e-06, "logits/chosen": -1.647325038909912, "logits/rejected": -1.0531593561172485, "logps/chosen": -184.522216796875, "logps/rejected": -50.252479553222656, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 5.898007392883301, "rewards/margins": 2.52580189704895, "rewards/rejected": 3.3722054958343506, "step": 2746 }, { "epoch": 0.61, "learning_rate": 8.157917490027518e-06, "logits/chosen": -1.6164788007736206, "logits/rejected": -1.6284679174423218, "logps/chosen": -78.50806427001953, "logps/rejected": -71.36822509765625, "loss": 1.3368, "rewards/accuracies": 0.0, "rewards/chosen": 3.5418832302093506, "rewards/margins": -2.3681695461273193, "rewards/rejected": 5.91005277633667, "step": 2747 }, { "epoch": 0.61, "learning_rate": 8.156527681110576e-06, "logits/chosen": -1.3332599401474, "logits/rejected": -1.3332599401474, "logps/chosen": -78.56512451171875, "logps/rejected": -78.56512451171875, "loss": 0.356, "rewards/accuracies": 0.0, "rewards/chosen": 3.2180848121643066, "rewards/margins": 0.0, "rewards/rejected": 3.2180848121643066, "step": 2748 }, { "epoch": 0.61, "learning_rate": 8.155137466587415e-06, "logits/chosen": -1.9036518335342407, "logits/rejected": -1.7877284288406372, "logps/chosen": -85.43036651611328, "logps/rejected": -68.68862915039062, "loss": 0.7943, "rewards/accuracies": 1.0, "rewards/chosen": 4.090590953826904, "rewards/margins": 0.7023394107818604, "rewards/rejected": 3.388251543045044, "step": 2749 }, { "epoch": 0.61, "learning_rate": 8.153746846636675e-06, "logits/chosen": -1.5176998376846313, "logits/rejected": -1.5176998376846313, "logps/chosen": -46.14695739746094, "logps/rejected": -46.14695739746094, "loss": 0.3499, "rewards/accuracies": 0.0, "rewards/chosen": 1.8537399768829346, "rewards/margins": 0.0, "rewards/rejected": 1.8537399768829346, "step": 2750 }, { "epoch": 0.61, "learning_rate": 8.152355821437048e-06, "logits/chosen": -1.5449376106262207, "logits/rejected": -1.6279083490371704, "logps/chosen": -38.7930908203125, "logps/rejected": -101.96341705322266, "loss": 1.7101, "rewards/accuracies": 0.0, "rewards/chosen": 3.834332227706909, "rewards/margins": -3.3151557445526123, "rewards/rejected": 7.1494879722595215, "step": 2751 }, { "epoch": 0.61, "learning_rate": 8.150964391167273e-06, "logits/chosen": -1.7571437358856201, "logits/rejected": -1.6927434206008911, "logps/chosen": -139.96841430664062, "logps/rejected": -151.19754028320312, "loss": 0.7127, "rewards/accuracies": 0.0, "rewards/chosen": 7.170881748199463, "rewards/margins": -1.0327191352844238, "rewards/rejected": 8.203600883483887, "step": 2752 }, { "epoch": 0.61, "learning_rate": 8.149572556006151e-06, "logits/chosen": -1.6479816436767578, "logits/rejected": -1.623679757118225, "logps/chosen": -65.5948257446289, "logps/rejected": -57.58263397216797, "loss": 2.6177, "rewards/accuracies": 0.0, "rewards/chosen": 1.4978049993515015, "rewards/margins": -0.6732667684555054, "rewards/rejected": 2.171071767807007, "step": 2753 }, { "epoch": 0.61, "learning_rate": 8.148180316132526e-06, "logits/chosen": -1.284844994544983, "logits/rejected": -1.2716118097305298, "logps/chosen": -79.61640930175781, "logps/rejected": -54.48005676269531, "loss": 3.1944, "rewards/accuracies": 0.0, "rewards/chosen": 2.7600204944610596, "rewards/margins": -1.2025146484375, "rewards/rejected": 3.9625351428985596, "step": 2754 }, { "epoch": 0.61, "learning_rate": 8.146787671725299e-06, "logits/chosen": -1.969210147857666, "logits/rejected": -1.882290005683899, "logps/chosen": -86.89041900634766, "logps/rejected": -65.75164794921875, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": 7.038370609283447, "rewards/margins": 2.034651756286621, "rewards/rejected": 5.003718852996826, "step": 2755 }, { "epoch": 0.61, "learning_rate": 8.14539462296342e-06, "logits/chosen": -1.4872504472732544, "logits/rejected": -1.468088984489441, "logps/chosen": -53.63520812988281, "logps/rejected": -104.09551239013672, "loss": 0.8913, "rewards/accuracies": 0.0, "rewards/chosen": 4.697572231292725, "rewards/margins": -0.5687403678894043, "rewards/rejected": 5.266312599182129, "step": 2756 }, { "epoch": 0.61, "learning_rate": 8.144001170025894e-06, "logits/chosen": -1.3542001247406006, "logits/rejected": -1.3542001247406006, "logps/chosen": -57.5133056640625, "logps/rejected": -57.5133056640625, "loss": 0.5854, "rewards/accuracies": 0.0, "rewards/chosen": 4.059206485748291, "rewards/margins": 0.0, "rewards/rejected": 4.059206485748291, "step": 2757 }, { "epoch": 0.61, "learning_rate": 8.142607313091775e-06, "logits/chosen": -1.5432294607162476, "logits/rejected": -1.4238959550857544, "logps/chosen": -93.9497299194336, "logps/rejected": -71.23846435546875, "loss": 0.8067, "rewards/accuracies": 1.0, "rewards/chosen": 5.041169166564941, "rewards/margins": 1.177588701248169, "rewards/rejected": 3.8635804653167725, "step": 2758 }, { "epoch": 0.61, "learning_rate": 8.141213052340171e-06, "logits/chosen": -1.984564185142517, "logits/rejected": -1.9041415452957153, "logps/chosen": -91.39599609375, "logps/rejected": -118.66065979003906, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 6.140038967132568, "rewards/margins": -0.16837024688720703, "rewards/rejected": 6.308409214019775, "step": 2759 }, { "epoch": 0.61, "learning_rate": 8.13981838795024e-06, "logits/chosen": -1.3298546075820923, "logits/rejected": -1.2807502746582031, "logps/chosen": -85.1214599609375, "logps/rejected": -127.35066223144531, "loss": 1.004, "rewards/accuracies": 0.0, "rewards/chosen": 5.068301677703857, "rewards/margins": -1.0136394500732422, "rewards/rejected": 6.0819411277771, "step": 2760 }, { "epoch": 0.61, "learning_rate": 8.138423320101196e-06, "logits/chosen": -1.909763216972351, "logits/rejected": -1.8638908863067627, "logps/chosen": -55.48798370361328, "logps/rejected": -48.43020248413086, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 7.348331451416016, "rewards/margins": 3.9913089275360107, "rewards/rejected": 3.357022523880005, "step": 2761 }, { "epoch": 0.61, "learning_rate": 8.1370278489723e-06, "logits/chosen": -1.3892698287963867, "logits/rejected": -1.2804994583129883, "logps/chosen": -104.6954345703125, "logps/rejected": -90.49273681640625, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 5.345358371734619, "rewards/margins": 3.2192771434783936, "rewards/rejected": 2.1260812282562256, "step": 2762 }, { "epoch": 0.61, "learning_rate": 8.135631974742863e-06, "logits/chosen": -1.4736824035644531, "logits/rejected": -1.467758297920227, "logps/chosen": -36.51751708984375, "logps/rejected": -51.5915412902832, "loss": 1.2501, "rewards/accuracies": 0.0, "rewards/chosen": 2.369140625, "rewards/margins": -0.6506848335266113, "rewards/rejected": 3.0198254585266113, "step": 2763 }, { "epoch": 0.61, "learning_rate": 8.13423569759226e-06, "logits/chosen": -1.4581705331802368, "logits/rejected": -1.5210331678390503, "logps/chosen": -74.45185089111328, "logps/rejected": -112.1184310913086, "loss": 1.9003, "rewards/accuracies": 0.0, "rewards/chosen": 2.8225700855255127, "rewards/margins": -3.7759416103363037, "rewards/rejected": 6.598511695861816, "step": 2764 }, { "epoch": 0.61, "learning_rate": 8.132839017699901e-06, "logits/chosen": -1.7543452978134155, "logits/rejected": -1.7117700576782227, "logps/chosen": -139.53762817382812, "logps/rejected": -140.619384765625, "loss": 1.3118, "rewards/accuracies": 1.0, "rewards/chosen": 7.468313694000244, "rewards/margins": 0.6695737838745117, "rewards/rejected": 6.798739910125732, "step": 2765 }, { "epoch": 0.61, "learning_rate": 8.131441935245261e-06, "logits/chosen": -1.2883703708648682, "logits/rejected": -1.2538378238677979, "logps/chosen": -54.031951904296875, "logps/rejected": -72.09349060058594, "loss": 1.2372, "rewards/accuracies": 0.0, "rewards/chosen": 3.475935459136963, "rewards/margins": -0.8997054100036621, "rewards/rejected": 4.375640869140625, "step": 2766 }, { "epoch": 0.61, "learning_rate": 8.13004445040786e-06, "logits/chosen": -1.5982105731964111, "logits/rejected": -1.6035946607589722, "logps/chosen": -89.7486801147461, "logps/rejected": -185.64598083496094, "loss": 1.3745, "rewards/accuracies": 1.0, "rewards/chosen": 7.2434916496276855, "rewards/margins": 1.0066871643066406, "rewards/rejected": 6.236804485321045, "step": 2767 }, { "epoch": 0.61, "learning_rate": 8.128646563367271e-06, "logits/chosen": -1.2045412063598633, "logits/rejected": -1.1943295001983643, "logps/chosen": -22.42084503173828, "logps/rejected": -50.76465606689453, "loss": 0.5741, "rewards/accuracies": 1.0, "rewards/chosen": 2.2140114307403564, "rewards/margins": 0.5511764287948608, "rewards/rejected": 1.6628350019454956, "step": 2768 }, { "epoch": 0.61, "learning_rate": 8.12724827430312e-06, "logits/chosen": -1.6862823963165283, "logits/rejected": -1.696521282196045, "logps/chosen": -56.43815612792969, "logps/rejected": -69.462890625, "loss": 0.334, "rewards/accuracies": 1.0, "rewards/chosen": 3.5492546558380127, "rewards/margins": 0.7685303688049316, "rewards/rejected": 2.780724287033081, "step": 2769 }, { "epoch": 0.61, "learning_rate": 8.125849583395083e-06, "logits/chosen": -1.552520751953125, "logits/rejected": -1.5324506759643555, "logps/chosen": -62.68719482421875, "logps/rejected": -94.56333923339844, "loss": 1.7118, "rewards/accuracies": 0.0, "rewards/chosen": 3.060260772705078, "rewards/margins": -2.783832550048828, "rewards/rejected": 5.844093322753906, "step": 2770 }, { "epoch": 0.61, "learning_rate": 8.124450490822889e-06, "logits/chosen": -1.6234159469604492, "logits/rejected": -1.6509543657302856, "logps/chosen": -28.711181640625, "logps/rejected": -112.68339538574219, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 4.082417964935303, "rewards/margins": 1.955594539642334, "rewards/rejected": 2.1268234252929688, "step": 2771 }, { "epoch": 0.61, "learning_rate": 8.123050996766317e-06, "logits/chosen": -1.7963392734527588, "logits/rejected": -1.7786322832107544, "logps/chosen": -82.33145141601562, "logps/rejected": -183.15736389160156, "loss": 1.8876, "rewards/accuracies": 0.0, "rewards/chosen": 1.755597710609436, "rewards/margins": -3.748289108276367, "rewards/rejected": 5.503886699676514, "step": 2772 }, { "epoch": 0.61, "learning_rate": 8.121651101405202e-06, "logits/chosen": -1.5535486936569214, "logits/rejected": -1.4726275205612183, "logps/chosen": -86.51341247558594, "logps/rejected": -78.53955078125, "loss": 0.1436, "rewards/accuracies": 1.0, "rewards/chosen": 5.511003017425537, "rewards/margins": 1.2301740646362305, "rewards/rejected": 4.280828952789307, "step": 2773 }, { "epoch": 0.61, "learning_rate": 8.120250804919424e-06, "logits/chosen": -1.43021559715271, "logits/rejected": -1.296510100364685, "logps/chosen": -78.44092559814453, "logps/rejected": -48.3381462097168, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 6.82009220123291, "rewards/margins": 6.007610321044922, "rewards/rejected": 0.812481701374054, "step": 2774 }, { "epoch": 0.61, "learning_rate": 8.118850107488916e-06, "logits/chosen": -1.6410460472106934, "logits/rejected": -1.50742769241333, "logps/chosen": -117.67837524414062, "logps/rejected": -63.504432678222656, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 7.787878513336182, "rewards/margins": 6.475142478942871, "rewards/rejected": 1.3127357959747314, "step": 2775 }, { "epoch": 0.61, "learning_rate": 8.117449009293668e-06, "logits/chosen": -1.1215695142745972, "logits/rejected": -1.1215695142745972, "logps/chosen": -52.40464782714844, "logps/rejected": -52.40464782714844, "loss": 0.4089, "rewards/accuracies": 0.0, "rewards/chosen": 4.252677917480469, "rewards/margins": 0.0, "rewards/rejected": 4.252677917480469, "step": 2776 }, { "epoch": 0.61, "learning_rate": 8.116047510513718e-06, "logits/chosen": -1.4935921430587769, "logits/rejected": -1.532389760017395, "logps/chosen": -47.975799560546875, "logps/rejected": -46.266998291015625, "loss": 0.6449, "rewards/accuracies": 0.0, "rewards/chosen": 2.719207763671875, "rewards/margins": -0.8767533302307129, "rewards/rejected": 3.595961093902588, "step": 2777 }, { "epoch": 0.61, "learning_rate": 8.114645611329152e-06, "logits/chosen": -1.8057361841201782, "logits/rejected": -1.8263044357299805, "logps/chosen": -97.89607238769531, "logps/rejected": -36.65038299560547, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 5.4922637939453125, "rewards/margins": 2.993457317352295, "rewards/rejected": 2.4988064765930176, "step": 2778 }, { "epoch": 0.62, "learning_rate": 8.113243311920113e-06, "logits/chosen": -1.6715041399002075, "logits/rejected": -1.6715041399002075, "logps/chosen": -72.15917205810547, "logps/rejected": -72.15917205810547, "loss": 0.9622, "rewards/accuracies": 0.0, "rewards/chosen": 7.587722301483154, "rewards/margins": 0.0, "rewards/rejected": 7.587722301483154, "step": 2779 }, { "epoch": 0.62, "learning_rate": 8.111840612466792e-06, "logits/chosen": -1.6817375421524048, "logits/rejected": -1.6090788841247559, "logps/chosen": -130.9117889404297, "logps/rejected": -123.85596466064453, "loss": 0.3621, "rewards/accuracies": 1.0, "rewards/chosen": 7.766108989715576, "rewards/margins": 0.2523798942565918, "rewards/rejected": 7.513729095458984, "step": 2780 }, { "epoch": 0.62, "learning_rate": 8.110437513149433e-06, "logits/chosen": -1.944573163986206, "logits/rejected": -1.8495889902114868, "logps/chosen": -155.52227783203125, "logps/rejected": -48.042572021484375, "loss": 2.026, "rewards/accuracies": 1.0, "rewards/chosen": 7.43011474609375, "rewards/margins": 3.2760252952575684, "rewards/rejected": 4.154089450836182, "step": 2781 }, { "epoch": 0.62, "learning_rate": 8.109034014148331e-06, "logits/chosen": -1.405333161354065, "logits/rejected": -1.3090355396270752, "logps/chosen": -62.870147705078125, "logps/rejected": -27.773332595825195, "loss": 0.3579, "rewards/accuracies": 1.0, "rewards/chosen": 3.7976365089416504, "rewards/margins": 2.494098663330078, "rewards/rejected": 1.3035379648208618, "step": 2782 }, { "epoch": 0.62, "learning_rate": 8.107630115643832e-06, "logits/chosen": -1.6660929918289185, "logits/rejected": -1.706146478652954, "logps/chosen": -79.70909118652344, "logps/rejected": -154.56324768066406, "loss": 0.3575, "rewards/accuracies": 0.0, "rewards/chosen": 6.100253582000732, "rewards/margins": -0.01367950439453125, "rewards/rejected": 6.113933086395264, "step": 2783 }, { "epoch": 0.62, "learning_rate": 8.106225817816333e-06, "logits/chosen": -1.466710090637207, "logits/rejected": -1.468950629234314, "logps/chosen": -53.114471435546875, "logps/rejected": -53.19872283935547, "loss": 1.5311, "rewards/accuracies": 0.0, "rewards/chosen": 3.6850433349609375, "rewards/margins": -2.9088282585144043, "rewards/rejected": 6.593871593475342, "step": 2784 }, { "epoch": 0.62, "learning_rate": 8.104821120846287e-06, "logits/chosen": -1.6159253120422363, "logits/rejected": -1.6814439296722412, "logps/chosen": -75.1502914428711, "logps/rejected": -114.47935485839844, "loss": 0.4576, "rewards/accuracies": 1.0, "rewards/chosen": 6.3815131187438965, "rewards/margins": 1.0658941268920898, "rewards/rejected": 5.315618991851807, "step": 2785 }, { "epoch": 0.62, "learning_rate": 8.103416024914186e-06, "logits/chosen": -1.6006969213485718, "logits/rejected": -1.5619570016860962, "logps/chosen": -76.18190002441406, "logps/rejected": -48.844024658203125, "loss": 0.3924, "rewards/accuracies": 1.0, "rewards/chosen": 3.8905227184295654, "rewards/margins": 0.44718384742736816, "rewards/rejected": 3.4433388710021973, "step": 2786 }, { "epoch": 0.62, "learning_rate": 8.102010530200589e-06, "logits/chosen": -1.5718767642974854, "logits/rejected": -1.5417144298553467, "logps/chosen": -66.11996459960938, "logps/rejected": -44.75934982299805, "loss": 1.4905, "rewards/accuracies": 0.0, "rewards/chosen": 2.0761184692382812, "rewards/margins": -2.0494580268859863, "rewards/rejected": 4.125576496124268, "step": 2787 }, { "epoch": 0.62, "learning_rate": 8.100604636886095e-06, "logits/chosen": -1.824786901473999, "logits/rejected": -1.2522549629211426, "logps/chosen": -176.69482421875, "logps/rejected": -52.776336669921875, "loss": 0.4016, "rewards/accuracies": 1.0, "rewards/chosen": 4.1749114990234375, "rewards/margins": 0.49585866928100586, "rewards/rejected": 3.6790528297424316, "step": 2788 }, { "epoch": 0.62, "learning_rate": 8.09919834515136e-06, "logits/chosen": -1.4740561246871948, "logits/rejected": -1.4300010204315186, "logps/chosen": -87.61821746826172, "logps/rejected": -59.000244140625, "loss": 0.139, "rewards/accuracies": 1.0, "rewards/chosen": 6.083148956298828, "rewards/margins": 2.0115623474121094, "rewards/rejected": 4.071586608886719, "step": 2789 }, { "epoch": 0.62, "learning_rate": 8.097791655177085e-06, "logits/chosen": -1.43879234790802, "logits/rejected": -1.48506498336792, "logps/chosen": -103.40006256103516, "logps/rejected": -101.19220733642578, "loss": 3.0002, "rewards/accuracies": 0.0, "rewards/chosen": 2.7835686206817627, "rewards/margins": -5.419734954833984, "rewards/rejected": 8.203303337097168, "step": 2790 }, { "epoch": 0.62, "learning_rate": 8.096384567144033e-06, "logits/chosen": -1.3640819787979126, "logits/rejected": -1.3886977434158325, "logps/chosen": -36.60816955566406, "logps/rejected": -32.814697265625, "loss": 0.6264, "rewards/accuracies": 0.0, "rewards/chosen": 2.7551255226135254, "rewards/margins": -0.9135246276855469, "rewards/rejected": 3.6686501502990723, "step": 2791 }, { "epoch": 0.62, "learning_rate": 8.094977081233006e-06, "logits/chosen": -1.717660903930664, "logits/rejected": -1.6622494459152222, "logps/chosen": -81.98751068115234, "logps/rejected": -111.43732452392578, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 5.726233959197998, "rewards/margins": 1.601719856262207, "rewards/rejected": 4.124514102935791, "step": 2792 }, { "epoch": 0.62, "learning_rate": 8.093569197624864e-06, "logits/chosen": -1.3139930963516235, "logits/rejected": -1.3375874757766724, "logps/chosen": -83.3499526977539, "logps/rejected": -80.69105529785156, "loss": 0.8515, "rewards/accuracies": 1.0, "rewards/chosen": 3.393270969390869, "rewards/margins": 0.9743010997772217, "rewards/rejected": 2.4189698696136475, "step": 2793 }, { "epoch": 0.62, "learning_rate": 8.092160916500515e-06, "logits/chosen": -1.5601330995559692, "logits/rejected": -1.5387722253799438, "logps/chosen": -39.55651092529297, "logps/rejected": -22.17751693725586, "loss": 2.1728, "rewards/accuracies": 0.0, "rewards/chosen": 1.7153072357177734, "rewards/margins": -0.3544585704803467, "rewards/rejected": 2.06976580619812, "step": 2794 }, { "epoch": 0.62, "learning_rate": 8.090752238040925e-06, "logits/chosen": -1.5427727699279785, "logits/rejected": -1.5427727699279785, "logps/chosen": -36.60582733154297, "logps/rejected": -36.60582733154297, "loss": 0.6901, "rewards/accuracies": 0.0, "rewards/chosen": 5.334175109863281, "rewards/margins": 0.0, "rewards/rejected": 5.334175109863281, "step": 2795 }, { "epoch": 0.62, "learning_rate": 8.0893431624271e-06, "logits/chosen": -1.69246244430542, "logits/rejected": -1.682335615158081, "logps/chosen": -42.83979797363281, "logps/rejected": -52.25941848754883, "loss": 1.3414, "rewards/accuracies": 1.0, "rewards/chosen": 2.6302735805511475, "rewards/margins": 0.035605430603027344, "rewards/rejected": 2.59466814994812, "step": 2796 }, { "epoch": 0.62, "learning_rate": 8.087933689840107e-06, "logits/chosen": -1.4530889987945557, "logits/rejected": -1.4995121955871582, "logps/chosen": -98.01699829101562, "logps/rejected": -92.35995483398438, "loss": 3.2447, "rewards/accuracies": 0.0, "rewards/chosen": 2.024610996246338, "rewards/margins": -4.486079216003418, "rewards/rejected": 6.510690212249756, "step": 2797 }, { "epoch": 0.62, "learning_rate": 8.086523820461057e-06, "logits/chosen": -1.4793171882629395, "logits/rejected": -1.4362256526947021, "logps/chosen": -51.86662292480469, "logps/rejected": -52.68547821044922, "loss": 0.1433, "rewards/accuracies": 1.0, "rewards/chosen": 3.4793946743011475, "rewards/margins": 1.157628059387207, "rewards/rejected": 2.3217666149139404, "step": 2798 }, { "epoch": 0.62, "learning_rate": 8.085113554471115e-06, "logits/chosen": -1.6192916631698608, "logits/rejected": -1.4366666078567505, "logps/chosen": -116.93267822265625, "logps/rejected": -40.11284637451172, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": 5.1416168212890625, "rewards/margins": 2.027125597000122, "rewards/rejected": 3.1144912242889404, "step": 2799 }, { "epoch": 0.62, "learning_rate": 8.083702892051499e-06, "logits/chosen": -1.5569778680801392, "logits/rejected": -1.4966816902160645, "logps/chosen": -139.29165649414062, "logps/rejected": -73.48831176757812, "loss": 1.983, "rewards/accuracies": 0.0, "rewards/chosen": 6.470343112945557, "rewards/margins": -0.7200760841369629, "rewards/rejected": 7.1904191970825195, "step": 2800 }, { "epoch": 0.62, "learning_rate": 8.082291833383475e-06, "logits/chosen": -1.619220495223999, "logits/rejected": -1.5771881341934204, "logps/chosen": -63.656375885009766, "logps/rejected": -44.005619049072266, "loss": 0.6209, "rewards/accuracies": 0.0, "rewards/chosen": 2.513274908065796, "rewards/margins": -0.10843944549560547, "rewards/rejected": 2.6217143535614014, "step": 2801 }, { "epoch": 0.62, "learning_rate": 8.080880378648359e-06, "logits/chosen": -1.568189263343811, "logits/rejected": -1.620106816291809, "logps/chosen": -44.87483215332031, "logps/rejected": -104.86833190917969, "loss": 2.8502, "rewards/accuracies": 0.0, "rewards/chosen": 2.019786834716797, "rewards/margins": -5.631368160247803, "rewards/rejected": 7.6511549949646, "step": 2802 }, { "epoch": 0.62, "learning_rate": 8.079468528027519e-06, "logits/chosen": -1.7062978744506836, "logits/rejected": -1.7622392177581787, "logps/chosen": -96.44601440429688, "logps/rejected": -90.22819519042969, "loss": 3.8132, "rewards/accuracies": 0.0, "rewards/chosen": 4.991611003875732, "rewards/margins": -7.622378826141357, "rewards/rejected": 12.61398983001709, "step": 2803 }, { "epoch": 0.62, "learning_rate": 8.078056281702378e-06, "logits/chosen": -1.5556942224502563, "logits/rejected": -1.5335558652877808, "logps/chosen": -76.87614440917969, "logps/rejected": -78.18160247802734, "loss": 0.9445, "rewards/accuracies": 0.0, "rewards/chosen": 2.714338779449463, "rewards/margins": -0.022157907485961914, "rewards/rejected": 2.736496686935425, "step": 2804 }, { "epoch": 0.62, "learning_rate": 8.076643639854405e-06, "logits/chosen": -1.5360705852508545, "logits/rejected": -1.4853821992874146, "logps/chosen": -120.16644287109375, "logps/rejected": -57.842857360839844, "loss": 1.897, "rewards/accuracies": 0.0, "rewards/chosen": 5.096202373504639, "rewards/margins": -0.3060111999511719, "rewards/rejected": 5.4022135734558105, "step": 2805 }, { "epoch": 0.62, "learning_rate": 8.075230602665118e-06, "logits/chosen": -1.413045048713684, "logits/rejected": -1.3807048797607422, "logps/chosen": -82.7156982421875, "logps/rejected": -59.25922393798828, "loss": 0.3813, "rewards/accuracies": 1.0, "rewards/chosen": 4.733888149261475, "rewards/margins": 0.6504340171813965, "rewards/rejected": 4.083454132080078, "step": 2806 }, { "epoch": 0.62, "learning_rate": 8.073817170316093e-06, "logits/chosen": -1.2775346040725708, "logits/rejected": -1.2607485055923462, "logps/chosen": -44.77186965942383, "logps/rejected": -38.86555480957031, "loss": 0.7192, "rewards/accuracies": 1.0, "rewards/chosen": 2.1060001850128174, "rewards/margins": 0.030362367630004883, "rewards/rejected": 2.0756378173828125, "step": 2807 }, { "epoch": 0.62, "learning_rate": 8.07240334298895e-06, "logits/chosen": -1.6831752061843872, "logits/rejected": -1.5706642866134644, "logps/chosen": -149.46275329589844, "logps/rejected": -97.44835662841797, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 8.968998908996582, "rewards/margins": 3.7283759117126465, "rewards/rejected": 5.2406229972839355, "step": 2808 }, { "epoch": 0.62, "learning_rate": 8.070989120865362e-06, "logits/chosen": -1.4212863445281982, "logits/rejected": -1.188462257385254, "logps/chosen": -72.59125518798828, "logps/rejected": -90.74028015136719, "loss": 0.3215, "rewards/accuracies": 1.0, "rewards/chosen": 4.211974620819092, "rewards/margins": 0.1031808853149414, "rewards/rejected": 4.10879373550415, "step": 2809 }, { "epoch": 0.62, "learning_rate": 8.069574504127058e-06, "logits/chosen": -1.8178410530090332, "logits/rejected": -1.5992465019226074, "logps/chosen": -84.16596984863281, "logps/rejected": -63.443851470947266, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 6.0370988845825195, "rewards/margins": 4.826225280761719, "rewards/rejected": 1.2108738422393799, "step": 2810 }, { "epoch": 0.62, "learning_rate": 8.068159492955806e-06, "logits/chosen": -1.7390763759613037, "logits/rejected": -1.7602295875549316, "logps/chosen": -60.18226623535156, "logps/rejected": -89.42153930664062, "loss": 1.4204, "rewards/accuracies": 0.0, "rewards/chosen": 3.3682656288146973, "rewards/margins": -2.746504783630371, "rewards/rejected": 6.114770412445068, "step": 2811 }, { "epoch": 0.62, "learning_rate": 8.066744087533436e-06, "logits/chosen": -1.2636445760726929, "logits/rejected": -1.1752151250839233, "logps/chosen": -67.81292724609375, "logps/rejected": -74.22743225097656, "loss": 1.6248, "rewards/accuracies": 0.0, "rewards/chosen": 2.6996383666992188, "rewards/margins": -0.7804381847381592, "rewards/rejected": 3.480076551437378, "step": 2812 }, { "epoch": 0.62, "learning_rate": 8.065328288041823e-06, "logits/chosen": -1.548757791519165, "logits/rejected": -1.457107663154602, "logps/chosen": -102.2509765625, "logps/rejected": -85.46161651611328, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": 8.119343757629395, "rewards/margins": 4.393532752990723, "rewards/rejected": 3.725811004638672, "step": 2813 }, { "epoch": 0.62, "learning_rate": 8.063912094662893e-06, "logits/chosen": -1.4127349853515625, "logits/rejected": -1.4127349853515625, "logps/chosen": -70.54203796386719, "logps/rejected": -70.54203796386719, "loss": 1.3515, "rewards/accuracies": 0.0, "rewards/chosen": 3.53605055809021, "rewards/margins": 0.0, "rewards/rejected": 3.53605055809021, "step": 2814 }, { "epoch": 0.62, "learning_rate": 8.062495507578628e-06, "logits/chosen": -1.5161784887313843, "logits/rejected": -1.53145432472229, "logps/chosen": -38.516563415527344, "logps/rejected": -44.95387268066406, "loss": 1.3786, "rewards/accuracies": 0.0, "rewards/chosen": 3.0432121753692627, "rewards/margins": -2.160324811935425, "rewards/rejected": 5.2035369873046875, "step": 2815 }, { "epoch": 0.62, "learning_rate": 8.061078526971048e-06, "logits/chosen": -1.5134379863739014, "logits/rejected": -1.4063737392425537, "logps/chosen": -48.47138214111328, "logps/rejected": -71.75819396972656, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": 4.412259578704834, "rewards/margins": 1.0225179195404053, "rewards/rejected": 3.3897416591644287, "step": 2816 }, { "epoch": 0.62, "learning_rate": 8.059661153022236e-06, "logits/chosen": -1.9089404344558716, "logits/rejected": -1.861600399017334, "logps/chosen": -34.49928283691406, "logps/rejected": -61.23643493652344, "loss": 1.3837, "rewards/accuracies": 1.0, "rewards/chosen": 1.9834511280059814, "rewards/margins": 0.0034035444259643555, "rewards/rejected": 1.980047583580017, "step": 2817 }, { "epoch": 0.62, "learning_rate": 8.058243385914324e-06, "logits/chosen": -1.6438558101654053, "logits/rejected": -1.667046070098877, "logps/chosen": -67.77560424804688, "logps/rejected": -50.35944366455078, "loss": 1.8741, "rewards/accuracies": 0.0, "rewards/chosen": 2.061497449874878, "rewards/margins": -3.5566246509552, "rewards/rejected": 5.618122100830078, "step": 2818 }, { "epoch": 0.62, "learning_rate": 8.056825225829486e-06, "logits/chosen": -1.5485563278198242, "logits/rejected": -1.5877946615219116, "logps/chosen": -114.73931121826172, "logps/rejected": -122.2524185180664, "loss": 2.6419, "rewards/accuracies": 0.0, "rewards/chosen": 5.208222389221191, "rewards/margins": -4.202768325805664, "rewards/rejected": 9.410990715026855, "step": 2819 }, { "epoch": 0.62, "learning_rate": 8.055406672949957e-06, "logits/chosen": -1.3752703666687012, "logits/rejected": -1.3777228593826294, "logps/chosen": -33.27470397949219, "logps/rejected": -45.97933578491211, "loss": 1.0598, "rewards/accuracies": 0.0, "rewards/chosen": 1.720607042312622, "rewards/margins": -1.7029500007629395, "rewards/rejected": 3.4235570430755615, "step": 2820 }, { "epoch": 0.62, "learning_rate": 8.053987727458013e-06, "logits/chosen": -1.555456280708313, "logits/rejected": -1.4473518133163452, "logps/chosen": -121.88523864746094, "logps/rejected": -76.66817474365234, "loss": 0.7675, "rewards/accuracies": 1.0, "rewards/chosen": 6.8610124588012695, "rewards/margins": 2.9634957313537598, "rewards/rejected": 3.8975167274475098, "step": 2821 }, { "epoch": 0.62, "learning_rate": 8.05256838953599e-06, "logits/chosen": -1.3550214767456055, "logits/rejected": -1.3424322605133057, "logps/chosen": -73.00404357910156, "logps/rejected": -79.95953369140625, "loss": 2.2621, "rewards/accuracies": 0.0, "rewards/chosen": 4.603498935699463, "rewards/margins": -0.4742417335510254, "rewards/rejected": 5.077740669250488, "step": 2822 }, { "epoch": 0.62, "learning_rate": 8.051148659366265e-06, "logits/chosen": -1.701373815536499, "logits/rejected": -1.722967505455017, "logps/chosen": -108.2644271850586, "logps/rejected": -109.083740234375, "loss": 2.1999, "rewards/accuracies": 0.0, "rewards/chosen": 5.237732887268066, "rewards/margins": -3.4954843521118164, "rewards/rejected": 8.733217239379883, "step": 2823 }, { "epoch": 0.63, "learning_rate": 8.049728537131275e-06, "logits/chosen": -1.648646354675293, "logits/rejected": -1.6207811832427979, "logps/chosen": -71.77879333496094, "logps/rejected": -39.79570388793945, "loss": 0.2195, "rewards/accuracies": 1.0, "rewards/chosen": 3.3368232250213623, "rewards/margins": 0.8705408573150635, "rewards/rejected": 2.466282367706299, "step": 2824 }, { "epoch": 0.63, "learning_rate": 8.048308023013498e-06, "logits/chosen": -1.8172872066497803, "logits/rejected": -1.7791311740875244, "logps/chosen": -61.45093536376953, "logps/rejected": -73.70709991455078, "loss": 0.6716, "rewards/accuracies": 1.0, "rewards/chosen": 2.631260633468628, "rewards/margins": 1.3841391801834106, "rewards/rejected": 1.2471214532852173, "step": 2825 }, { "epoch": 0.63, "learning_rate": 8.046887117195467e-06, "logits/chosen": -1.6540932655334473, "logits/rejected": -1.60067880153656, "logps/chosen": -108.24089050292969, "logps/rejected": -141.29736328125, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 8.656962394714355, "rewards/margins": 1.6703486442565918, "rewards/rejected": 6.986613750457764, "step": 2826 }, { "epoch": 0.63, "learning_rate": 8.045465819859766e-06, "logits/chosen": -1.196333885192871, "logits/rejected": -1.2356218099594116, "logps/chosen": -12.89303970336914, "logps/rejected": -27.97663116455078, "loss": 1.4038, "rewards/accuracies": 1.0, "rewards/chosen": 1.0550174713134766, "rewards/margins": 0.058236658573150635, "rewards/rejected": 0.9967808127403259, "step": 2827 }, { "epoch": 0.63, "learning_rate": 8.044044131189029e-06, "logits/chosen": -1.7061042785644531, "logits/rejected": -1.645607590675354, "logps/chosen": -64.06494903564453, "logps/rejected": -21.494516372680664, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 1.1970123052597046, "rewards/margins": 1.240496277809143, "rewards/rejected": -0.0434839241206646, "step": 2828 }, { "epoch": 0.63, "learning_rate": 8.042622051365938e-06, "logits/chosen": -1.6013352870941162, "logits/rejected": -1.6116394996643066, "logps/chosen": -84.48771667480469, "logps/rejected": -75.91095733642578, "loss": 1.2137, "rewards/accuracies": 0.0, "rewards/chosen": 3.4771926403045654, "rewards/margins": -2.127854108810425, "rewards/rejected": 5.60504674911499, "step": 2829 }, { "epoch": 0.63, "learning_rate": 8.041199580573229e-06, "logits/chosen": -1.3463218212127686, "logits/rejected": -1.306404709815979, "logps/chosen": -72.54959869384766, "logps/rejected": -78.58172607421875, "loss": 0.4377, "rewards/accuracies": 0.0, "rewards/chosen": 1.8731330633163452, "rewards/margins": -0.07365953922271729, "rewards/rejected": 1.9467926025390625, "step": 2830 }, { "epoch": 0.63, "learning_rate": 8.039776718993683e-06, "logits/chosen": -1.6950726509094238, "logits/rejected": -1.663689374923706, "logps/chosen": -51.308624267578125, "logps/rejected": -132.67343139648438, "loss": 3.5332, "rewards/accuracies": 0.0, "rewards/chosen": 4.571442604064941, "rewards/margins": -4.116056442260742, "rewards/rejected": 8.687499046325684, "step": 2831 }, { "epoch": 0.63, "learning_rate": 8.038353466810137e-06, "logits/chosen": -1.5834980010986328, "logits/rejected": -1.469028115272522, "logps/chosen": -63.52402877807617, "logps/rejected": -36.15395736694336, "loss": 0.1944, "rewards/accuracies": 1.0, "rewards/chosen": 5.0348639488220215, "rewards/margins": 0.7581667900085449, "rewards/rejected": 4.276697158813477, "step": 2832 }, { "epoch": 0.63, "learning_rate": 8.036929824205476e-06, "logits/chosen": -1.5209524631500244, "logits/rejected": -1.3321774005889893, "logps/chosen": -98.67471313476562, "logps/rejected": -48.28287887573242, "loss": 0.3943, "rewards/accuracies": 1.0, "rewards/chosen": 6.164285182952881, "rewards/margins": 2.3020968437194824, "rewards/rejected": 3.8621883392333984, "step": 2833 }, { "epoch": 0.63, "learning_rate": 8.03550579136263e-06, "logits/chosen": -1.6338821649551392, "logits/rejected": -1.5896168947219849, "logps/chosen": -96.0419921875, "logps/rejected": -115.20957946777344, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 7.818341255187988, "rewards/margins": 2.9006118774414062, "rewards/rejected": 4.917729377746582, "step": 2834 }, { "epoch": 0.63, "learning_rate": 8.03408136846459e-06, "logits/chosen": -1.5308947563171387, "logits/rejected": -1.4273208379745483, "logps/chosen": -122.84033966064453, "logps/rejected": -72.70091247558594, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 5.38366174697876, "rewards/margins": 2.641582489013672, "rewards/rejected": 2.742079257965088, "step": 2835 }, { "epoch": 0.63, "learning_rate": 8.032656555694388e-06, "logits/chosen": -1.5331428050994873, "logits/rejected": -1.4689645767211914, "logps/chosen": -129.64947509765625, "logps/rejected": -63.90608215332031, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 6.561084270477295, "rewards/margins": 3.3005449771881104, "rewards/rejected": 3.2605392932891846, "step": 2836 }, { "epoch": 0.63, "learning_rate": 8.031231353235104e-06, "logits/chosen": -1.9658489227294922, "logits/rejected": -1.9005948305130005, "logps/chosen": -42.886348724365234, "logps/rejected": -45.13002014160156, "loss": 0.1379, "rewards/accuracies": 1.0, "rewards/chosen": 2.534608840942383, "rewards/margins": 1.5120277404785156, "rewards/rejected": 1.0225811004638672, "step": 2837 }, { "epoch": 0.63, "learning_rate": 8.029805761269881e-06, "logits/chosen": -1.2111854553222656, "logits/rejected": -1.2111854553222656, "logps/chosen": -72.08949279785156, "logps/rejected": -72.08949279785156, "loss": 2.0176, "rewards/accuracies": 0.0, "rewards/chosen": 1.8381439447402954, "rewards/margins": 0.0, "rewards/rejected": 1.8381439447402954, "step": 2838 }, { "epoch": 0.63, "learning_rate": 8.028379779981902e-06, "logits/chosen": -1.4948252439498901, "logits/rejected": -1.2855768203735352, "logps/chosen": -50.847354888916016, "logps/rejected": -58.62403869628906, "loss": 0.3423, "rewards/accuracies": 1.0, "rewards/chosen": 2.4936587810516357, "rewards/margins": 0.5972644090652466, "rewards/rejected": 1.8963943719863892, "step": 2839 }, { "epoch": 0.63, "learning_rate": 8.026953409554402e-06, "logits/chosen": -1.639697551727295, "logits/rejected": -1.6824506521224976, "logps/chosen": -153.05599975585938, "logps/rejected": -79.60525512695312, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 9.441055297851562, "rewards/margins": 4.217965602874756, "rewards/rejected": 5.223089694976807, "step": 2840 }, { "epoch": 0.63, "learning_rate": 8.025526650170665e-06, "logits/chosen": -1.746206283569336, "logits/rejected": -1.714862585067749, "logps/chosen": -110.94410705566406, "logps/rejected": -95.03184509277344, "loss": 2.9596, "rewards/accuracies": 0.0, "rewards/chosen": 4.612851142883301, "rewards/margins": -2.1251769065856934, "rewards/rejected": 6.738028049468994, "step": 2841 }, { "epoch": 0.63, "learning_rate": 8.024099502014024e-06, "logits/chosen": -1.6943883895874023, "logits/rejected": -1.6184028387069702, "logps/chosen": -139.71560668945312, "logps/rejected": -59.82160186767578, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 4.974940776824951, "rewards/margins": 2.753856897354126, "rewards/rejected": 2.221083879470825, "step": 2842 }, { "epoch": 0.63, "learning_rate": 8.02267196526787e-06, "logits/chosen": -1.983717679977417, "logits/rejected": -1.926957607269287, "logps/chosen": -206.37686157226562, "logps/rejected": -120.582763671875, "loss": 0.4352, "rewards/accuracies": 0.0, "rewards/chosen": 7.9582366943359375, "rewards/margins": -0.24603939056396484, "rewards/rejected": 8.204276084899902, "step": 2843 }, { "epoch": 0.63, "learning_rate": 8.021244040115634e-06, "logits/chosen": -1.740999698638916, "logits/rejected": -1.6040537357330322, "logps/chosen": -82.58378601074219, "logps/rejected": -36.55998229980469, "loss": 0.136, "rewards/accuracies": 1.0, "rewards/chosen": 3.6480331420898438, "rewards/margins": 1.5049102306365967, "rewards/rejected": 2.143122911453247, "step": 2844 }, { "epoch": 0.63, "learning_rate": 8.019815726740801e-06, "logits/chosen": -1.4869368076324463, "logits/rejected": -1.4251627922058105, "logps/chosen": -53.38494110107422, "logps/rejected": -59.60137176513672, "loss": 0.1405, "rewards/accuracies": 1.0, "rewards/chosen": 3.6141107082366943, "rewards/margins": 1.5251634120941162, "rewards/rejected": 2.088947296142578, "step": 2845 }, { "epoch": 0.63, "learning_rate": 8.018387025326906e-06, "logits/chosen": -1.3879952430725098, "logits/rejected": -1.3879952430725098, "logps/chosen": -48.86467742919922, "logps/rejected": -48.86467742919922, "loss": 0.9905, "rewards/accuracies": 0.0, "rewards/chosen": 3.039992570877075, "rewards/margins": 0.0, "rewards/rejected": 3.039992570877075, "step": 2846 }, { "epoch": 0.63, "learning_rate": 8.016957936057535e-06, "logits/chosen": -1.5348762273788452, "logits/rejected": -1.551242470741272, "logps/chosen": -56.780113220214844, "logps/rejected": -79.80209350585938, "loss": 1.9514, "rewards/accuracies": 1.0, "rewards/chosen": 2.6851768493652344, "rewards/margins": 0.2232506275177002, "rewards/rejected": 2.461926221847534, "step": 2847 }, { "epoch": 0.63, "learning_rate": 8.015528459116321e-06, "logits/chosen": -1.4555641412734985, "logits/rejected": -1.4369521141052246, "logps/chosen": -34.80532455444336, "logps/rejected": -29.149410247802734, "loss": 0.3425, "rewards/accuracies": 1.0, "rewards/chosen": 3.4191181659698486, "rewards/margins": 0.22393488883972168, "rewards/rejected": 3.195183277130127, "step": 2848 }, { "epoch": 0.63, "learning_rate": 8.014098594686951e-06, "logits/chosen": -1.461679458618164, "logits/rejected": -1.443433403968811, "logps/chosen": -20.64250373840332, "logps/rejected": -48.91116714477539, "loss": 0.9584, "rewards/accuracies": 1.0, "rewards/chosen": 2.308079481124878, "rewards/margins": 0.41847336292266846, "rewards/rejected": 1.8896061182022095, "step": 2849 }, { "epoch": 0.63, "learning_rate": 8.012668342953155e-06, "logits/chosen": -1.5938471555709839, "logits/rejected": -1.5057144165039062, "logps/chosen": -58.20525360107422, "logps/rejected": -60.167598724365234, "loss": 0.1504, "rewards/accuracies": 1.0, "rewards/chosen": 4.63290548324585, "rewards/margins": 2.3479578495025635, "rewards/rejected": 2.284947633743286, "step": 2850 }, { "epoch": 0.63, "learning_rate": 8.011237704098721e-06, "logits/chosen": -1.8785771131515503, "logits/rejected": -1.670090675354004, "logps/chosen": -86.9149398803711, "logps/rejected": -27.097423553466797, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 9.03700065612793, "rewards/margins": 8.569754600524902, "rewards/rejected": 0.4672456681728363, "step": 2851 }, { "epoch": 0.63, "learning_rate": 8.00980667830748e-06, "logits/chosen": -1.6997065544128418, "logits/rejected": -1.6590471267700195, "logps/chosen": -72.30536651611328, "logps/rejected": -89.61518859863281, "loss": 1.8118, "rewards/accuracies": 1.0, "rewards/chosen": 3.4684722423553467, "rewards/margins": 0.07770538330078125, "rewards/rejected": 3.3907668590545654, "step": 2852 }, { "epoch": 0.63, "learning_rate": 8.008375265763317e-06, "logits/chosen": -1.486297607421875, "logits/rejected": -1.4786264896392822, "logps/chosen": -6.526932716369629, "logps/rejected": -18.288673400878906, "loss": 1.7171, "rewards/accuracies": 0.0, "rewards/chosen": 0.44967833161354065, "rewards/margins": -0.08026531338691711, "rewards/rejected": 0.5299436450004578, "step": 2853 }, { "epoch": 0.63, "learning_rate": 8.006943466650163e-06, "logits/chosen": -1.7540791034698486, "logits/rejected": -1.71438467502594, "logps/chosen": -46.919612884521484, "logps/rejected": -33.685081481933594, "loss": 0.435, "rewards/accuracies": 0.0, "rewards/chosen": 1.8833118677139282, "rewards/margins": -0.3268626928329468, "rewards/rejected": 2.210174560546875, "step": 2854 }, { "epoch": 0.63, "learning_rate": 8.005511281152004e-06, "logits/chosen": -1.8476108312606812, "logits/rejected": -1.8601763248443604, "logps/chosen": -65.22071075439453, "logps/rejected": -83.58580017089844, "loss": 0.2503, "rewards/accuracies": 1.0, "rewards/chosen": 4.26326847076416, "rewards/margins": 0.4927804470062256, "rewards/rejected": 3.7704880237579346, "step": 2855 }, { "epoch": 0.63, "learning_rate": 8.004078709452869e-06, "logits/chosen": -1.421208381652832, "logits/rejected": -1.375022530555725, "logps/chosen": -33.84968948364258, "logps/rejected": -39.40010452270508, "loss": 2.0799, "rewards/accuracies": 0.0, "rewards/chosen": 1.132158637046814, "rewards/margins": -1.9749196767807007, "rewards/rejected": 3.1070783138275146, "step": 2856 }, { "epoch": 0.63, "learning_rate": 8.002645751736841e-06, "logits/chosen": -1.5881905555725098, "logits/rejected": -1.4767224788665771, "logps/chosen": -122.07026672363281, "logps/rejected": -29.209291458129883, "loss": 0.346, "rewards/accuracies": 1.0, "rewards/chosen": 4.171962261199951, "rewards/margins": 0.24489092826843262, "rewards/rejected": 3.9270713329315186, "step": 2857 }, { "epoch": 0.63, "learning_rate": 8.001212408188052e-06, "logits/chosen": -1.5148342847824097, "logits/rejected": -1.4432997703552246, "logps/chosen": -44.69174575805664, "logps/rejected": -11.588768005371094, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": 2.4242336750030518, "rewards/margins": 2.157402753829956, "rewards/rejected": 0.26683101058006287, "step": 2858 }, { "epoch": 0.63, "learning_rate": 7.999778678990685e-06, "logits/chosen": -1.721701741218567, "logits/rejected": -1.605255365371704, "logps/chosen": -111.01289367675781, "logps/rejected": -53.1660041809082, "loss": 0.1739, "rewards/accuracies": 1.0, "rewards/chosen": 7.359950542449951, "rewards/margins": 3.0643911361694336, "rewards/rejected": 4.295559406280518, "step": 2859 }, { "epoch": 0.63, "learning_rate": 7.998344564328967e-06, "logits/chosen": -1.4999277591705322, "logits/rejected": -1.4895110130310059, "logps/chosen": -22.544952392578125, "logps/rejected": -112.45350646972656, "loss": 1.1439, "rewards/accuracies": 0.0, "rewards/chosen": 3.205087423324585, "rewards/margins": -0.48836517333984375, "rewards/rejected": 3.6934525966644287, "step": 2860 }, { "epoch": 0.63, "learning_rate": 7.996910064387181e-06, "logits/chosen": -1.4944348335266113, "logits/rejected": -1.4714992046356201, "logps/chosen": -47.32164001464844, "logps/rejected": -43.45805358886719, "loss": 0.1541, "rewards/accuracies": 1.0, "rewards/chosen": 3.1231369972229004, "rewards/margins": 1.3639527559280396, "rewards/rejected": 1.7591842412948608, "step": 2861 }, { "epoch": 0.63, "learning_rate": 7.995475179349657e-06, "logits/chosen": -1.3619744777679443, "logits/rejected": -1.347195029258728, "logps/chosen": -61.33715057373047, "logps/rejected": -79.83988952636719, "loss": 3.1735, "rewards/accuracies": 0.0, "rewards/chosen": 1.6548866033554077, "rewards/margins": -1.403678297996521, "rewards/rejected": 3.0585649013519287, "step": 2862 }, { "epoch": 0.63, "learning_rate": 7.994039909400773e-06, "logits/chosen": -1.4543366432189941, "logits/rejected": -1.42680823802948, "logps/chosen": -61.37115478515625, "logps/rejected": -72.95401000976562, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 2.7757568359375, "rewards/margins": 0.5592131614685059, "rewards/rejected": 2.216543674468994, "step": 2863 }, { "epoch": 0.63, "learning_rate": 7.992604254724957e-06, "logits/chosen": -1.7545876502990723, "logits/rejected": -1.7069894075393677, "logps/chosen": -140.59007263183594, "logps/rejected": -80.78471374511719, "loss": 0.1163, "rewards/accuracies": 1.0, "rewards/chosen": 7.984660625457764, "rewards/margins": 2.105989456176758, "rewards/rejected": 5.878671169281006, "step": 2864 }, { "epoch": 0.63, "learning_rate": 7.991168215506688e-06, "logits/chosen": -1.9931820631027222, "logits/rejected": -1.9874428510665894, "logps/chosen": -61.8366584777832, "logps/rejected": -79.87367248535156, "loss": 0.1731, "rewards/accuracies": 1.0, "rewards/chosen": 3.7642345428466797, "rewards/margins": 0.914201021194458, "rewards/rejected": 2.8500335216522217, "step": 2865 }, { "epoch": 0.63, "learning_rate": 7.989731791930497e-06, "logits/chosen": -1.4797323942184448, "logits/rejected": -1.5312737226486206, "logps/chosen": -73.24539184570312, "logps/rejected": -120.062255859375, "loss": 1.0058, "rewards/accuracies": 0.0, "rewards/chosen": 5.8418731689453125, "rewards/margins": -1.8444671630859375, "rewards/rejected": 7.68634033203125, "step": 2866 }, { "epoch": 0.63, "learning_rate": 7.988294984180956e-06, "logits/chosen": -1.5548890829086304, "logits/rejected": -1.4667603969573975, "logps/chosen": -56.79068374633789, "logps/rejected": -14.579488754272461, "loss": 0.785, "rewards/accuracies": 1.0, "rewards/chosen": 2.914950132369995, "rewards/margins": 1.8914605379104614, "rewards/rejected": 1.0234895944595337, "step": 2867 }, { "epoch": 0.63, "learning_rate": 7.986857792442692e-06, "logits/chosen": -1.297258734703064, "logits/rejected": -1.2712373733520508, "logps/chosen": -18.809938430786133, "logps/rejected": -16.668397903442383, "loss": 0.8204, "rewards/accuracies": 0.0, "rewards/chosen": 1.2468369007110596, "rewards/margins": -0.05288088321685791, "rewards/rejected": 1.2997177839279175, "step": 2868 }, { "epoch": 0.64, "learning_rate": 7.985420216900384e-06, "logits/chosen": -1.6843557357788086, "logits/rejected": -1.5702693462371826, "logps/chosen": -70.1214599609375, "logps/rejected": -142.72056579589844, "loss": 2.3573, "rewards/accuracies": 0.0, "rewards/chosen": 2.3303489685058594, "rewards/margins": -4.087709903717041, "rewards/rejected": 6.4180588722229, "step": 2869 }, { "epoch": 0.64, "learning_rate": 7.983982257738752e-06, "logits/chosen": -1.7111297845840454, "logits/rejected": -1.7164173126220703, "logps/chosen": -64.2790756225586, "logps/rejected": -36.68019104003906, "loss": 0.4134, "rewards/accuracies": 0.0, "rewards/chosen": 3.0260751247406006, "rewards/margins": -0.23851609230041504, "rewards/rejected": 3.2645912170410156, "step": 2870 }, { "epoch": 0.64, "learning_rate": 7.982543915142575e-06, "logits/chosen": -1.6221989393234253, "logits/rejected": -1.5919828414916992, "logps/chosen": -44.73754119873047, "logps/rejected": -60.819313049316406, "loss": 1.0133, "rewards/accuracies": 0.0, "rewards/chosen": 4.050225257873535, "rewards/margins": -1.1024932861328125, "rewards/rejected": 5.152718544006348, "step": 2871 }, { "epoch": 0.64, "learning_rate": 7.981105189296676e-06, "logits/chosen": -1.8494377136230469, "logits/rejected": -1.7719179391860962, "logps/chosen": -77.839111328125, "logps/rejected": -45.178077697753906, "loss": 1.6092, "rewards/accuracies": 1.0, "rewards/chosen": 2.630201816558838, "rewards/margins": 0.36754536628723145, "rewards/rejected": 2.2626564502716064, "step": 2872 }, { "epoch": 0.64, "learning_rate": 7.979666080385923e-06, "logits/chosen": -1.0876543521881104, "logits/rejected": -1.0876543521881104, "logps/chosen": -31.74211311340332, "logps/rejected": -31.74211311340332, "loss": 0.3913, "rewards/accuracies": 0.0, "rewards/chosen": 2.0590689182281494, "rewards/margins": 0.0, "rewards/rejected": 2.0590689182281494, "step": 2873 }, { "epoch": 0.64, "learning_rate": 7.978226588595245e-06, "logits/chosen": -1.5637822151184082, "logits/rejected": -1.5673449039459229, "logps/chosen": -82.52275085449219, "logps/rejected": -42.433494567871094, "loss": 0.3723, "rewards/accuracies": 0.0, "rewards/chosen": 3.2780227661132812, "rewards/margins": -0.059493303298950195, "rewards/rejected": 3.3375160694122314, "step": 2874 }, { "epoch": 0.64, "learning_rate": 7.976786714109608e-06, "logits/chosen": -1.5517114400863647, "logits/rejected": -1.4750019311904907, "logps/chosen": -91.2479248046875, "logps/rejected": -46.07012939453125, "loss": 0.2397, "rewards/accuracies": 1.0, "rewards/chosen": 5.0584564208984375, "rewards/margins": 1.6534552574157715, "rewards/rejected": 3.405001163482666, "step": 2875 }, { "epoch": 0.64, "learning_rate": 7.975346457114034e-06, "logits/chosen": -1.7975877523422241, "logits/rejected": -1.802634358406067, "logps/chosen": -78.68766021728516, "logps/rejected": -60.03446960449219, "loss": 0.1836, "rewards/accuracies": 1.0, "rewards/chosen": 5.765683650970459, "rewards/margins": 0.8130283355712891, "rewards/rejected": 4.95265531539917, "step": 2876 }, { "epoch": 0.64, "learning_rate": 7.973905817793594e-06, "logits/chosen": -1.7103594541549683, "logits/rejected": -1.7206058502197266, "logps/chosen": -60.77793884277344, "logps/rejected": -102.93116760253906, "loss": 1.454, "rewards/accuracies": 0.0, "rewards/chosen": 2.1339683532714844, "rewards/margins": -1.3275933265686035, "rewards/rejected": 3.461561679840088, "step": 2877 }, { "epoch": 0.64, "learning_rate": 7.972464796333408e-06, "logits/chosen": -1.360028624534607, "logits/rejected": -1.3506789207458496, "logps/chosen": -6.136240005493164, "logps/rejected": -13.847254753112793, "loss": 0.2822, "rewards/accuracies": 1.0, "rewards/chosen": 1.8093146085739136, "rewards/margins": 0.4613785743713379, "rewards/rejected": 1.3479360342025757, "step": 2878 }, { "epoch": 0.64, "learning_rate": 7.971023392918637e-06, "logits/chosen": -1.7162915468215942, "logits/rejected": -1.6864581108093262, "logps/chosen": -40.56526184082031, "logps/rejected": -45.79459762573242, "loss": 1.9388, "rewards/accuracies": 1.0, "rewards/chosen": 2.809467315673828, "rewards/margins": 0.36308932304382324, "rewards/rejected": 2.446377992630005, "step": 2879 }, { "epoch": 0.64, "learning_rate": 7.969581607734504e-06, "logits/chosen": -1.8074593544006348, "logits/rejected": -1.7613264322280884, "logps/chosen": -100.2909927368164, "logps/rejected": -69.56156921386719, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 4.769887447357178, "rewards/margins": 0.05038642883300781, "rewards/rejected": 4.71950101852417, "step": 2880 }, { "epoch": 0.64, "learning_rate": 7.968139440966271e-06, "logits/chosen": -1.6258563995361328, "logits/rejected": -1.6357619762420654, "logps/chosen": -41.01101303100586, "logps/rejected": -60.18724060058594, "loss": 1.5087, "rewards/accuracies": 0.0, "rewards/chosen": 2.1079812049865723, "rewards/margins": -1.4313361644744873, "rewards/rejected": 3.5393173694610596, "step": 2881 }, { "epoch": 0.64, "learning_rate": 7.966696892799257e-06, "logits/chosen": -1.5870591402053833, "logits/rejected": -1.523199200630188, "logps/chosen": -26.278745651245117, "logps/rejected": -63.40989685058594, "loss": 0.516, "rewards/accuracies": 0.0, "rewards/chosen": 1.1706434488296509, "rewards/margins": -0.07750105857849121, "rewards/rejected": 1.248144507408142, "step": 2882 }, { "epoch": 0.64, "learning_rate": 7.965253963418825e-06, "logits/chosen": -1.4663310050964355, "logits/rejected": -1.4504755735397339, "logps/chosen": -54.63478088378906, "logps/rejected": -55.97822570800781, "loss": 0.6602, "rewards/accuracies": 0.0, "rewards/chosen": 3.359046220779419, "rewards/margins": -0.3466620445251465, "rewards/rejected": 3.7057082653045654, "step": 2883 }, { "epoch": 0.64, "learning_rate": 7.963810653010385e-06, "logits/chosen": -1.7612450122833252, "logits/rejected": -1.6662912368774414, "logps/chosen": -109.02786254882812, "logps/rejected": -143.55081176757812, "loss": 0.9925, "rewards/accuracies": 0.0, "rewards/chosen": 2.5575759410858154, "rewards/margins": -1.7590715885162354, "rewards/rejected": 4.316647529602051, "step": 2884 }, { "epoch": 0.64, "learning_rate": 7.962366961759402e-06, "logits/chosen": -1.376416563987732, "logits/rejected": -1.3884434700012207, "logps/chosen": -78.36799621582031, "logps/rejected": -115.61068725585938, "loss": 0.7215, "rewards/accuracies": 0.0, "rewards/chosen": 5.834576606750488, "rewards/margins": -1.148022174835205, "rewards/rejected": 6.982598781585693, "step": 2885 }, { "epoch": 0.64, "learning_rate": 7.960922889851386e-06, "logits/chosen": -1.651003360748291, "logits/rejected": -1.651003360748291, "logps/chosen": -44.64503860473633, "logps/rejected": -44.64503860473633, "loss": 0.3631, "rewards/accuracies": 0.0, "rewards/chosen": 3.999835729598999, "rewards/margins": 0.0, "rewards/rejected": 3.999835729598999, "step": 2886 }, { "epoch": 0.64, "learning_rate": 7.959478437471894e-06, "logits/chosen": -1.6319193840026855, "logits/rejected": -1.597301721572876, "logps/chosen": -69.62501525878906, "logps/rejected": -45.88322448730469, "loss": 0.6669, "rewards/accuracies": 0.0, "rewards/chosen": 2.434521436691284, "rewards/margins": -1.0220489501953125, "rewards/rejected": 3.4565703868865967, "step": 2887 }, { "epoch": 0.64, "learning_rate": 7.95803360480654e-06, "logits/chosen": -1.4889289140701294, "logits/rejected": -1.6133878231048584, "logps/chosen": -49.05000686645508, "logps/rejected": -87.88995361328125, "loss": 2.9781, "rewards/accuracies": 0.0, "rewards/chosen": 3.9205892086029053, "rewards/margins": -5.907947540283203, "rewards/rejected": 9.828536987304688, "step": 2888 }, { "epoch": 0.64, "learning_rate": 7.956588392040978e-06, "logits/chosen": -1.5530949831008911, "logits/rejected": -1.551638126373291, "logps/chosen": -72.86575317382812, "logps/rejected": -77.99702453613281, "loss": 0.2299, "rewards/accuracies": 1.0, "rewards/chosen": 7.05242919921875, "rewards/margins": 0.5477142333984375, "rewards/rejected": 6.5047149658203125, "step": 2889 }, { "epoch": 0.64, "learning_rate": 7.955142799360914e-06, "logits/chosen": -1.7531369924545288, "logits/rejected": -1.7323358058929443, "logps/chosen": -57.15669250488281, "logps/rejected": -62.65808868408203, "loss": 0.9222, "rewards/accuracies": 0.0, "rewards/chosen": 2.9177520275115967, "rewards/margins": -1.6401512622833252, "rewards/rejected": 4.557903289794922, "step": 2890 }, { "epoch": 0.64, "learning_rate": 7.953696826952106e-06, "logits/chosen": -1.6370564699172974, "logits/rejected": -1.6195011138916016, "logps/chosen": -123.65774536132812, "logps/rejected": -7.768200874328613, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 6.771705627441406, "rewards/margins": 5.581496715545654, "rewards/rejected": 1.190208911895752, "step": 2891 }, { "epoch": 0.64, "learning_rate": 7.952250475000354e-06, "logits/chosen": -1.6500605344772339, "logits/rejected": -1.5898523330688477, "logps/chosen": -154.3914794921875, "logps/rejected": -72.27923583984375, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 8.657977104187012, "rewards/margins": 5.841928482055664, "rewards/rejected": 2.8160483837127686, "step": 2892 }, { "epoch": 0.64, "learning_rate": 7.950803743691516e-06, "logits/chosen": -1.6004809141159058, "logits/rejected": -1.4043060541152954, "logps/chosen": -61.97270202636719, "logps/rejected": -13.246004104614258, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 2.996382236480713, "rewards/margins": 2.161688804626465, "rewards/rejected": 0.8346933722496033, "step": 2893 }, { "epoch": 0.64, "learning_rate": 7.949356633211487e-06, "logits/chosen": -1.7192559242248535, "logits/rejected": -1.4495785236358643, "logps/chosen": -86.90483093261719, "logps/rejected": -90.64419555664062, "loss": 0.8096, "rewards/accuracies": 0.0, "rewards/chosen": 3.27337646484375, "rewards/margins": -0.9729323387145996, "rewards/rejected": 4.24630880355835, "step": 2894 }, { "epoch": 0.64, "learning_rate": 7.947909143746221e-06, "logits/chosen": -1.7565217018127441, "logits/rejected": -1.6935316324234009, "logps/chosen": -59.90865707397461, "logps/rejected": -74.33648681640625, "loss": 0.362, "rewards/accuracies": 1.0, "rewards/chosen": 3.1083438396453857, "rewards/margins": 0.03281426429748535, "rewards/rejected": 3.0755295753479004, "step": 2895 }, { "epoch": 0.64, "learning_rate": 7.946461275481719e-06, "logits/chosen": -1.5045855045318604, "logits/rejected": -1.4833446741104126, "logps/chosen": -69.83586120605469, "logps/rejected": -41.87608337402344, "loss": 0.6406, "rewards/accuracies": 0.0, "rewards/chosen": 2.0528533458709717, "rewards/margins": -0.2948441505432129, "rewards/rejected": 2.3476974964141846, "step": 2896 }, { "epoch": 0.64, "learning_rate": 7.945013028604026e-06, "logits/chosen": -1.6538242101669312, "logits/rejected": -1.5827676057815552, "logps/chosen": -99.57469940185547, "logps/rejected": -22.678192138671875, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": 6.2913336753845215, "rewards/margins": 4.560624122619629, "rewards/rejected": 1.730709433555603, "step": 2897 }, { "epoch": 0.64, "learning_rate": 7.943564403299238e-06, "logits/chosen": -1.683313012123108, "logits/rejected": -1.6798151731491089, "logps/chosen": -55.76877212524414, "logps/rejected": -95.80838775634766, "loss": 0.4845, "rewards/accuracies": 0.0, "rewards/chosen": 4.042651653289795, "rewards/margins": -0.22916603088378906, "rewards/rejected": 4.271817684173584, "step": 2898 }, { "epoch": 0.64, "learning_rate": 7.9421153997535e-06, "logits/chosen": -1.574419617652893, "logits/rejected": -1.5730998516082764, "logps/chosen": -34.233394622802734, "logps/rejected": -48.94706726074219, "loss": 1.0683, "rewards/accuracies": 0.0, "rewards/chosen": 2.3988494873046875, "rewards/margins": -1.6206254959106445, "rewards/rejected": 4.019474983215332, "step": 2899 }, { "epoch": 0.64, "learning_rate": 7.940666018153004e-06, "logits/chosen": -1.3242305517196655, "logits/rejected": -1.3259477615356445, "logps/chosen": -37.50336456298828, "logps/rejected": -62.4401741027832, "loss": 1.5705, "rewards/accuracies": 0.0, "rewards/chosen": 4.031687259674072, "rewards/margins": -1.2925691604614258, "rewards/rejected": 5.324256420135498, "step": 2900 }, { "epoch": 0.64, "learning_rate": 7.939216258683997e-06, "logits/chosen": -1.92141592502594, "logits/rejected": -1.7844271659851074, "logps/chosen": -28.833873748779297, "logps/rejected": -96.0797119140625, "loss": 1.3371, "rewards/accuracies": 1.0, "rewards/chosen": 3.5732197761535645, "rewards/margins": 0.8505649566650391, "rewards/rejected": 2.7226548194885254, "step": 2901 }, { "epoch": 0.64, "learning_rate": 7.937766121532766e-06, "logits/chosen": -1.8581079244613647, "logits/rejected": -1.7117060422897339, "logps/chosen": -84.88908386230469, "logps/rejected": -19.28887176513672, "loss": 0.3452, "rewards/accuracies": 1.0, "rewards/chosen": 5.873649597167969, "rewards/margins": 4.984459400177002, "rewards/rejected": 0.8891903162002563, "step": 2902 }, { "epoch": 0.64, "learning_rate": 7.936315606885649e-06, "logits/chosen": -1.5548044443130493, "logits/rejected": -1.4667307138442993, "logps/chosen": -81.17351531982422, "logps/rejected": -12.007593154907227, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 4.475116729736328, "rewards/margins": 1.6309502124786377, "rewards/rejected": 2.8441665172576904, "step": 2903 }, { "epoch": 0.64, "learning_rate": 7.934864714929036e-06, "logits/chosen": -1.6318892240524292, "logits/rejected": -1.5583831071853638, "logps/chosen": -53.66854476928711, "logps/rejected": -70.99472045898438, "loss": 0.4214, "rewards/accuracies": 0.0, "rewards/chosen": 4.286115646362305, "rewards/margins": -0.2539663314819336, "rewards/rejected": 4.540081977844238, "step": 2904 }, { "epoch": 0.64, "learning_rate": 7.933413445849361e-06, "logits/chosen": -1.596274495124817, "logits/rejected": -1.5811187028884888, "logps/chosen": -50.82221984863281, "logps/rejected": -50.91580581665039, "loss": 1.7205, "rewards/accuracies": 1.0, "rewards/chosen": 3.5522353649139404, "rewards/margins": 1.6217578649520874, "rewards/rejected": 1.930477499961853, "step": 2905 }, { "epoch": 0.64, "learning_rate": 7.931961799833112e-06, "logits/chosen": -1.6453096866607666, "logits/rejected": -1.6053848266601562, "logps/chosen": -43.206512451171875, "logps/rejected": -78.30042266845703, "loss": 0.5498, "rewards/accuracies": 0.0, "rewards/chosen": 2.5311081409454346, "rewards/margins": -0.6762230396270752, "rewards/rejected": 3.2073311805725098, "step": 2906 }, { "epoch": 0.64, "learning_rate": 7.930509777066819e-06, "logits/chosen": -1.7273651361465454, "logits/rejected": -1.725100040435791, "logps/chosen": -32.70256805419922, "logps/rejected": -61.353782653808594, "loss": 1.6987, "rewards/accuracies": 0.0, "rewards/chosen": 2.788536548614502, "rewards/margins": -1.201465129852295, "rewards/rejected": 3.990001678466797, "step": 2907 }, { "epoch": 0.64, "learning_rate": 7.929057377737064e-06, "logits/chosen": -1.437870979309082, "logits/rejected": -1.1705904006958008, "logps/chosen": -31.69556999206543, "logps/rejected": -46.185760498046875, "loss": 0.4977, "rewards/accuracies": 1.0, "rewards/chosen": 2.984283685684204, "rewards/margins": 1.6468943357467651, "rewards/rejected": 1.337389349937439, "step": 2908 }, { "epoch": 0.64, "learning_rate": 7.92760460203048e-06, "logits/chosen": -1.5613821744918823, "logits/rejected": -1.4453685283660889, "logps/chosen": -110.8028564453125, "logps/rejected": -41.97275924682617, "loss": 0.2921, "rewards/accuracies": 1.0, "rewards/chosen": 5.296443462371826, "rewards/margins": 2.879359245300293, "rewards/rejected": 2.417084217071533, "step": 2909 }, { "epoch": 0.64, "learning_rate": 7.926151450133738e-06, "logits/chosen": -1.7659337520599365, "logits/rejected": -1.7336695194244385, "logps/chosen": -45.45111846923828, "logps/rejected": -38.00823211669922, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 3.627584218978882, "rewards/margins": 1.4883763790130615, "rewards/rejected": 2.1392078399658203, "step": 2910 }, { "epoch": 0.64, "learning_rate": 7.924697922233571e-06, "logits/chosen": -1.4204623699188232, "logits/rejected": -1.4450716972351074, "logps/chosen": -38.65385818481445, "logps/rejected": -72.74565124511719, "loss": 1.9092, "rewards/accuracies": 0.0, "rewards/chosen": 3.054316282272339, "rewards/margins": -0.46436500549316406, "rewards/rejected": 3.518681287765503, "step": 2911 }, { "epoch": 0.64, "learning_rate": 7.923244018516751e-06, "logits/chosen": -1.5675896406173706, "logits/rejected": -1.3960671424865723, "logps/chosen": -159.21334838867188, "logps/rejected": -78.27528381347656, "loss": 0.2021, "rewards/accuracies": 1.0, "rewards/chosen": 6.1275177001953125, "rewards/margins": 0.7502670288085938, "rewards/rejected": 5.377250671386719, "step": 2912 }, { "epoch": 0.64, "learning_rate": 7.921789739170102e-06, "logits/chosen": -1.4091626405715942, "logits/rejected": -1.306563138961792, "logps/chosen": -41.69817352294922, "logps/rejected": -34.60848617553711, "loss": 4.4527, "rewards/accuracies": 0.0, "rewards/chosen": 2.661741018295288, "rewards/margins": -0.6100215911865234, "rewards/rejected": 3.2717626094818115, "step": 2913 }, { "epoch": 0.64, "learning_rate": 7.920335084380497e-06, "logits/chosen": -1.5570677518844604, "logits/rejected": -1.5118478536605835, "logps/chosen": -51.47509002685547, "logps/rejected": -29.753482818603516, "loss": 0.4734, "rewards/accuracies": 0.0, "rewards/chosen": 3.612597703933716, "rewards/margins": -0.4538304805755615, "rewards/rejected": 4.066428184509277, "step": 2914 }, { "epoch": 0.65, "learning_rate": 7.918880054334853e-06, "logits/chosen": -1.4614076614379883, "logits/rejected": -1.4497348070144653, "logps/chosen": -199.32003784179688, "logps/rejected": -52.92742156982422, "loss": 1.1214, "rewards/accuracies": 1.0, "rewards/chosen": 6.450341701507568, "rewards/margins": 3.137181043624878, "rewards/rejected": 3.3131606578826904, "step": 2915 }, { "epoch": 0.65, "learning_rate": 7.91742464922014e-06, "logits/chosen": -1.5231984853744507, "logits/rejected": -1.5197142362594604, "logps/chosen": -43.360191345214844, "logps/rejected": -49.36358642578125, "loss": 0.6056, "rewards/accuracies": 0.0, "rewards/chosen": 2.5102570056915283, "rewards/margins": -0.06611013412475586, "rewards/rejected": 2.576367139816284, "step": 2916 }, { "epoch": 0.65, "learning_rate": 7.915968869223372e-06, "logits/chosen": -1.6806377172470093, "logits/rejected": -1.6590577363967896, "logps/chosen": -43.09136962890625, "logps/rejected": -48.338584899902344, "loss": 1.3856, "rewards/accuracies": 0.0, "rewards/chosen": 2.9484848976135254, "rewards/margins": -0.041860103607177734, "rewards/rejected": 2.990345001220703, "step": 2917 }, { "epoch": 0.65, "learning_rate": 7.914512714531612e-06, "logits/chosen": -1.183562994003296, "logits/rejected": -1.183562994003296, "logps/chosen": -16.635866165161133, "logps/rejected": -16.635866165161133, "loss": 0.7127, "rewards/accuracies": 0.0, "rewards/chosen": 0.3688272535800934, "rewards/margins": 0.0, "rewards/rejected": 0.3688272535800934, "step": 2918 }, { "epoch": 0.65, "learning_rate": 7.913056185331978e-06, "logits/chosen": -1.5364575386047363, "logits/rejected": -1.4932172298431396, "logps/chosen": -97.52814483642578, "logps/rejected": -44.55906677246094, "loss": 1.2187, "rewards/accuracies": 1.0, "rewards/chosen": 7.439998626708984, "rewards/margins": 2.529696464538574, "rewards/rejected": 4.91030216217041, "step": 2919 }, { "epoch": 0.65, "learning_rate": 7.911599281811624e-06, "logits/chosen": -1.2975575923919678, "logits/rejected": -1.1292155981063843, "logps/chosen": -59.26670837402344, "logps/rejected": -37.90104675292969, "loss": 0.4794, "rewards/accuracies": 1.0, "rewards/chosen": 5.0626349449157715, "rewards/margins": 2.6539316177368164, "rewards/rejected": 2.408703327178955, "step": 2920 }, { "epoch": 0.65, "learning_rate": 7.910142004157762e-06, "logits/chosen": -1.7987825870513916, "logits/rejected": -1.743144154548645, "logps/chosen": -49.45542907714844, "logps/rejected": -37.497066497802734, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 3.2189528942108154, "rewards/margins": 3.5825259685516357, "rewards/rejected": -0.3635730743408203, "step": 2921 }, { "epoch": 0.65, "learning_rate": 7.90868435255765e-06, "logits/chosen": -1.77162766456604, "logits/rejected": -1.717017412185669, "logps/chosen": -173.55111694335938, "logps/rejected": -99.40879821777344, "loss": 0.44, "rewards/accuracies": 0.0, "rewards/chosen": 6.944433689117432, "rewards/margins": -0.19724559783935547, "rewards/rejected": 7.141679286956787, "step": 2922 }, { "epoch": 0.65, "learning_rate": 7.90722632719859e-06, "logits/chosen": -1.5035406351089478, "logits/rejected": -1.3729766607284546, "logps/chosen": -231.15090942382812, "logps/rejected": -45.95464324951172, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 5.9262542724609375, "rewards/margins": 2.1562979221343994, "rewards/rejected": 3.769956350326538, "step": 2923 }, { "epoch": 0.65, "learning_rate": 7.905767928267936e-06, "logits/chosen": -1.512144923210144, "logits/rejected": -1.5158286094665527, "logps/chosen": -97.64561462402344, "logps/rejected": -103.81228637695312, "loss": 2.9326, "rewards/accuracies": 1.0, "rewards/chosen": 4.427098274230957, "rewards/margins": 0.5335037708282471, "rewards/rejected": 3.89359450340271, "step": 2924 }, { "epoch": 0.65, "learning_rate": 7.904309155953087e-06, "logits/chosen": -1.3875422477722168, "logits/rejected": -1.3283705711364746, "logps/chosen": -40.54884338378906, "logps/rejected": -44.680015563964844, "loss": 0.2172, "rewards/accuracies": 1.0, "rewards/chosen": 4.844624996185303, "rewards/margins": 0.8957724571228027, "rewards/rejected": 3.9488525390625, "step": 2925 }, { "epoch": 0.65, "learning_rate": 7.902850010441494e-06, "logits/chosen": -1.54372239112854, "logits/rejected": -1.4333479404449463, "logps/chosen": -73.39320373535156, "logps/rejected": -19.33612060546875, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 4.256659030914307, "rewards/margins": 3.5948679447174072, "rewards/rejected": 0.6617910265922546, "step": 2926 }, { "epoch": 0.65, "learning_rate": 7.901390491920655e-06, "logits/chosen": -1.933781385421753, "logits/rejected": -1.486553430557251, "logps/chosen": -40.541717529296875, "logps/rejected": -88.40168762207031, "loss": 1.2549, "rewards/accuracies": 0.0, "rewards/chosen": 2.123927354812622, "rewards/margins": -2.410517930984497, "rewards/rejected": 4.534445285797119, "step": 2927 }, { "epoch": 0.65, "learning_rate": 7.899930600578112e-06, "logits/chosen": -1.5751689672470093, "logits/rejected": -1.4919233322143555, "logps/chosen": -107.34579467773438, "logps/rejected": -51.94654083251953, "loss": 2.6936, "rewards/accuracies": 1.0, "rewards/chosen": 6.762795925140381, "rewards/margins": 2.4103918075561523, "rewards/rejected": 4.3524041175842285, "step": 2928 }, { "epoch": 0.65, "learning_rate": 7.898470336601456e-06, "logits/chosen": -1.4914295673370361, "logits/rejected": -1.4655239582061768, "logps/chosen": -42.02710723876953, "logps/rejected": -52.321998596191406, "loss": 0.4645, "rewards/accuracies": 0.0, "rewards/chosen": 2.131124973297119, "rewards/margins": -0.3107130527496338, "rewards/rejected": 2.441838026046753, "step": 2929 }, { "epoch": 0.65, "learning_rate": 7.897009700178331e-06, "logits/chosen": -1.4929841756820679, "logits/rejected": -1.434918999671936, "logps/chosen": -93.20756530761719, "logps/rejected": -40.603179931640625, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 6.366302490234375, "rewards/margins": 1.6801481246948242, "rewards/rejected": 4.686154365539551, "step": 2930 }, { "epoch": 0.65, "learning_rate": 7.895548691496421e-06, "logits/chosen": -1.49153470993042, "logits/rejected": -1.363863229751587, "logps/chosen": -76.05863952636719, "logps/rejected": -12.200414657592773, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 3.6347649097442627, "rewards/margins": 2.6174821853637695, "rewards/rejected": 1.0172828435897827, "step": 2931 }, { "epoch": 0.65, "learning_rate": 7.894087310743468e-06, "logits/chosen": -1.8605715036392212, "logits/rejected": -1.9433892965316772, "logps/chosen": -86.06930541992188, "logps/rejected": -128.49398803710938, "loss": 1.1699, "rewards/accuracies": 0.0, "rewards/chosen": 6.221479892730713, "rewards/margins": -1.7605791091918945, "rewards/rejected": 7.982059001922607, "step": 2932 }, { "epoch": 0.65, "learning_rate": 7.892625558107252e-06, "logits/chosen": -1.6055136919021606, "logits/rejected": -1.506941556930542, "logps/chosen": -53.79920196533203, "logps/rejected": -18.622577667236328, "loss": 0.7237, "rewards/accuracies": 1.0, "rewards/chosen": 2.942092180252075, "rewards/margins": 2.070094585418701, "rewards/rejected": 0.8719976544380188, "step": 2933 }, { "epoch": 0.65, "learning_rate": 7.891163433775605e-06, "logits/chosen": -1.516870141029358, "logits/rejected": -1.3799676895141602, "logps/chosen": -73.71548461914062, "logps/rejected": -21.777957916259766, "loss": 1.6126, "rewards/accuracies": 1.0, "rewards/chosen": 3.5445504188537598, "rewards/margins": 1.1077854633331299, "rewards/rejected": 2.43676495552063, "step": 2934 }, { "epoch": 0.65, "learning_rate": 7.889700937936408e-06, "logits/chosen": -1.6397738456726074, "logits/rejected": -1.5978260040283203, "logps/chosen": -148.65768432617188, "logps/rejected": -92.02124786376953, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 7.932275295257568, "rewards/margins": 1.2783193588256836, "rewards/rejected": 6.653955936431885, "step": 2935 }, { "epoch": 0.65, "learning_rate": 7.888238070777586e-06, "logits/chosen": -1.7546286582946777, "logits/rejected": -1.684255599975586, "logps/chosen": -70.91799926757812, "logps/rejected": -63.08473205566406, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 3.711186170578003, "rewards/margins": 1.6016616821289062, "rewards/rejected": 2.1095244884490967, "step": 2936 }, { "epoch": 0.65, "learning_rate": 7.886774832487116e-06, "logits/chosen": -1.3775898218154907, "logits/rejected": -1.3067795038223267, "logps/chosen": -62.786216735839844, "logps/rejected": -48.385215759277344, "loss": 0.5081, "rewards/accuracies": 1.0, "rewards/chosen": 5.150335788726807, "rewards/margins": 2.352682590484619, "rewards/rejected": 2.7976531982421875, "step": 2937 }, { "epoch": 0.65, "learning_rate": 7.885311223253018e-06, "logits/chosen": -1.3919769525527954, "logits/rejected": -1.4318257570266724, "logps/chosen": -107.04301452636719, "logps/rejected": -82.82283020019531, "loss": 2.2289, "rewards/accuracies": 0.0, "rewards/chosen": 3.0621628761291504, "rewards/margins": -4.414156913757324, "rewards/rejected": 7.476319789886475, "step": 2938 }, { "epoch": 0.65, "learning_rate": 7.883847243263366e-06, "logits/chosen": -1.6520740985870361, "logits/rejected": -1.6427416801452637, "logps/chosen": -46.161624908447266, "logps/rejected": -67.18624877929688, "loss": 0.2722, "rewards/accuracies": 1.0, "rewards/chosen": 2.7082622051239014, "rewards/margins": 0.34139513969421387, "rewards/rejected": 2.3668670654296875, "step": 2939 }, { "epoch": 0.65, "learning_rate": 7.882382892706273e-06, "logits/chosen": -1.5174509286880493, "logits/rejected": -1.3490989208221436, "logps/chosen": -118.76371002197266, "logps/rejected": -35.64424133300781, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 7.942437171936035, "rewards/margins": 4.911333084106445, "rewards/rejected": 3.031104326248169, "step": 2940 }, { "epoch": 0.65, "learning_rate": 7.88091817176991e-06, "logits/chosen": -1.797364354133606, "logits/rejected": -1.84866464138031, "logps/chosen": -11.484394073486328, "logps/rejected": -46.41368865966797, "loss": 1.013, "rewards/accuracies": 0.0, "rewards/chosen": 2.4103245735168457, "rewards/margins": -1.656184196472168, "rewards/rejected": 4.066508769989014, "step": 2941 }, { "epoch": 0.65, "learning_rate": 7.879453080642486e-06, "logits/chosen": -1.3859561681747437, "logits/rejected": -1.2039612531661987, "logps/chosen": -77.07697296142578, "logps/rejected": -39.24130630493164, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 4.875242710113525, "rewards/margins": 1.105295181274414, "rewards/rejected": 3.7699475288391113, "step": 2942 }, { "epoch": 0.65, "learning_rate": 7.877987619512263e-06, "logits/chosen": -1.9418169260025024, "logits/rejected": -1.9433367252349854, "logps/chosen": -59.11589050292969, "logps/rejected": -63.52941131591797, "loss": 3.8546, "rewards/accuracies": 0.0, "rewards/chosen": 2.6509201526641846, "rewards/margins": -1.4499938488006592, "rewards/rejected": 4.100914001464844, "step": 2943 }, { "epoch": 0.65, "learning_rate": 7.87652178856755e-06, "logits/chosen": -1.675658941268921, "logits/rejected": -1.6154166460037231, "logps/chosen": -36.53874206542969, "logps/rejected": -40.751033782958984, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": 3.621246337890625, "rewards/margins": 2.139604091644287, "rewards/rejected": 1.4816421270370483, "step": 2944 }, { "epoch": 0.65, "learning_rate": 7.875055587996703e-06, "logits/chosen": -1.3426988124847412, "logits/rejected": -1.3007721900939941, "logps/chosen": -34.06538391113281, "logps/rejected": -123.40959930419922, "loss": 0.4289, "rewards/accuracies": 0.0, "rewards/chosen": 2.8654391765594482, "rewards/margins": -0.26262855529785156, "rewards/rejected": 3.1280677318573, "step": 2945 }, { "epoch": 0.65, "learning_rate": 7.873589017988124e-06, "logits/chosen": -1.8445039987564087, "logits/rejected": -1.8357713222503662, "logps/chosen": -67.5794448852539, "logps/rejected": -62.23176574707031, "loss": 1.1321, "rewards/accuracies": 0.0, "rewards/chosen": 3.9834160804748535, "rewards/margins": -2.1522951126098633, "rewards/rejected": 6.135711193084717, "step": 2946 }, { "epoch": 0.65, "learning_rate": 7.872122078730263e-06, "logits/chosen": -1.586060881614685, "logits/rejected": -1.5510916709899902, "logps/chosen": -41.620365142822266, "logps/rejected": -25.478242874145508, "loss": 0.4705, "rewards/accuracies": 0.0, "rewards/chosen": 2.0345356464385986, "rewards/margins": -0.3776986598968506, "rewards/rejected": 2.412234306335449, "step": 2947 }, { "epoch": 0.65, "learning_rate": 7.87065477041162e-06, "logits/chosen": -1.4419939517974854, "logits/rejected": -1.408003330230713, "logps/chosen": -20.280881881713867, "logps/rejected": -1.46156644821167, "loss": 1.0625, "rewards/accuracies": 1.0, "rewards/chosen": 1.0182474851608276, "rewards/margins": 0.2292022705078125, "rewards/rejected": 0.7890452146530151, "step": 2948 }, { "epoch": 0.65, "learning_rate": 7.86918709322074e-06, "logits/chosen": -1.3110930919647217, "logits/rejected": -1.3048593997955322, "logps/chosen": -43.08573913574219, "logps/rejected": -38.70826721191406, "loss": 0.7951, "rewards/accuracies": 0.0, "rewards/chosen": 3.873461961746216, "rewards/margins": -0.06659388542175293, "rewards/rejected": 3.9400558471679688, "step": 2949 }, { "epoch": 0.65, "learning_rate": 7.867719047346216e-06, "logits/chosen": -1.7589023113250732, "logits/rejected": -1.751537799835205, "logps/chosen": -51.33502197265625, "logps/rejected": -66.39556121826172, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 3.769873857498169, "rewards/margins": 1.8060028553009033, "rewards/rejected": 1.9638710021972656, "step": 2950 }, { "epoch": 0.65, "learning_rate": 7.86625063297669e-06, "logits/chosen": -1.6014446020126343, "logits/rejected": -1.5913130044937134, "logps/chosen": -62.25092315673828, "logps/rejected": -88.71112823486328, "loss": 0.6667, "rewards/accuracies": 0.0, "rewards/chosen": 2.991971731185913, "rewards/margins": -0.04886317253112793, "rewards/rejected": 3.040834903717041, "step": 2951 }, { "epoch": 0.65, "learning_rate": 7.864781850300844e-06, "logits/chosen": -1.5629594326019287, "logits/rejected": -1.4735690355300903, "logps/chosen": -44.25513458251953, "logps/rejected": -32.216190338134766, "loss": 1.0288, "rewards/accuracies": 0.0, "rewards/chosen": 1.1217354536056519, "rewards/margins": -1.7089802026748657, "rewards/rejected": 2.8307156562805176, "step": 2952 }, { "epoch": 0.65, "learning_rate": 7.863312699507419e-06, "logits/chosen": -1.9972286224365234, "logits/rejected": -1.9137073755264282, "logps/chosen": -63.37425231933594, "logps/rejected": -87.394287109375, "loss": 0.3385, "rewards/accuracies": 1.0, "rewards/chosen": 6.1125168800354, "rewards/margins": 1.0511274337768555, "rewards/rejected": 5.061389446258545, "step": 2953 }, { "epoch": 0.65, "learning_rate": 7.861843180785196e-06, "logits/chosen": -1.494918942451477, "logits/rejected": -1.3710873126983643, "logps/chosen": -44.382972717285156, "logps/rejected": -13.682657241821289, "loss": 0.9465, "rewards/accuracies": 1.0, "rewards/chosen": 3.3185861110687256, "rewards/margins": 2.3612873554229736, "rewards/rejected": 0.9572986960411072, "step": 2954 }, { "epoch": 0.65, "learning_rate": 7.860373294323002e-06, "logits/chosen": -1.4704246520996094, "logits/rejected": -1.4352829456329346, "logps/chosen": -57.84039306640625, "logps/rejected": -4.854061126708984, "loss": 0.6106, "rewards/accuracies": 1.0, "rewards/chosen": 2.654200792312622, "rewards/margins": 0.7642234563827515, "rewards/rejected": 1.8899773359298706, "step": 2955 }, { "epoch": 0.65, "learning_rate": 7.858903040309717e-06, "logits/chosen": -1.6024242639541626, "logits/rejected": -1.4616577625274658, "logps/chosen": -85.79319763183594, "logps/rejected": -71.30782318115234, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": 5.835737705230713, "rewards/margins": 4.098968505859375, "rewards/rejected": 1.7367690801620483, "step": 2956 }, { "epoch": 0.65, "learning_rate": 7.857432418934264e-06, "logits/chosen": -1.61184823513031, "logits/rejected": -1.5138429403305054, "logps/chosen": -174.4537353515625, "logps/rejected": -138.84170532226562, "loss": 0.4016, "rewards/accuracies": 1.0, "rewards/chosen": 6.368467807769775, "rewards/margins": 0.6113357543945312, "rewards/rejected": 5.757132053375244, "step": 2957 }, { "epoch": 0.65, "learning_rate": 7.855961430385615e-06, "logits/chosen": -1.5633964538574219, "logits/rejected": -1.378008246421814, "logps/chosen": -54.01643371582031, "logps/rejected": -16.111345291137695, "loss": 2.0535, "rewards/accuracies": 1.0, "rewards/chosen": 2.7590973377227783, "rewards/margins": 1.431639313697815, "rewards/rejected": 1.3274580240249634, "step": 2958 }, { "epoch": 0.65, "learning_rate": 7.854490074852784e-06, "logits/chosen": -1.4649097919464111, "logits/rejected": -1.389783501625061, "logps/chosen": -48.38056182861328, "logps/rejected": -41.55057144165039, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 2.1804757118225098, "rewards/margins": 1.2731884717941284, "rewards/rejected": 0.9072872400283813, "step": 2959 }, { "epoch": 0.66, "learning_rate": 7.853018352524845e-06, "logits/chosen": -1.39292573928833, "logits/rejected": -1.3510048389434814, "logps/chosen": -59.862369537353516, "logps/rejected": -89.1681900024414, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 4.225952625274658, "rewards/margins": 1.7772328853607178, "rewards/rejected": 2.4487197399139404, "step": 2960 }, { "epoch": 0.66, "learning_rate": 7.851546263590905e-06, "logits/chosen": -1.5050855875015259, "logits/rejected": -0.9818440079689026, "logps/chosen": -26.894119262695312, "logps/rejected": -49.797359466552734, "loss": 0.1538, "rewards/accuracies": 1.0, "rewards/chosen": 2.880634307861328, "rewards/margins": 1.0554866790771484, "rewards/rejected": 1.8251476287841797, "step": 2961 }, { "epoch": 0.66, "learning_rate": 7.850073808240125e-06, "logits/chosen": -1.3892889022827148, "logits/rejected": -1.324434518814087, "logps/chosen": -57.270286560058594, "logps/rejected": -45.53681945800781, "loss": 0.5904, "rewards/accuracies": 0.0, "rewards/chosen": 2.4704978466033936, "rewards/margins": -0.073394775390625, "rewards/rejected": 2.5438926219940186, "step": 2962 }, { "epoch": 0.66, "learning_rate": 7.84860098666171e-06, "logits/chosen": -1.4713842868804932, "logits/rejected": -1.3578336238861084, "logps/chosen": -44.1177978515625, "logps/rejected": -21.206993103027344, "loss": 0.4084, "rewards/accuracies": 1.0, "rewards/chosen": 2.4479339122772217, "rewards/margins": 0.13196253776550293, "rewards/rejected": 2.3159713745117188, "step": 2963 }, { "epoch": 0.66, "learning_rate": 7.847127799044918e-06, "logits/chosen": -1.2572317123413086, "logits/rejected": -1.2414166927337646, "logps/chosen": -5.361345291137695, "logps/rejected": -1.3915998935699463, "loss": 0.3666, "rewards/accuracies": 0.0, "rewards/chosen": 0.7643699645996094, "rewards/margins": -0.05537903308868408, "rewards/rejected": 0.8197489976882935, "step": 2964 }, { "epoch": 0.66, "learning_rate": 7.845654245579047e-06, "logits/chosen": -1.2721034288406372, "logits/rejected": -1.2725498676300049, "logps/chosen": -42.6069450378418, "logps/rejected": -51.31018829345703, "loss": 0.4175, "rewards/accuracies": 0.0, "rewards/chosen": 5.110092639923096, "rewards/margins": -0.23514604568481445, "rewards/rejected": 5.34523868560791, "step": 2965 }, { "epoch": 0.66, "learning_rate": 7.844180326453447e-06, "logits/chosen": -1.7262842655181885, "logits/rejected": -1.57297682762146, "logps/chosen": -60.69995880126953, "logps/rejected": -50.36151123046875, "loss": 0.1866, "rewards/accuracies": 1.0, "rewards/chosen": 4.6260809898376465, "rewards/margins": 3.820096015930176, "rewards/rejected": 0.8059849143028259, "step": 2966 }, { "epoch": 0.66, "learning_rate": 7.842706041857512e-06, "logits/chosen": -1.1865651607513428, "logits/rejected": -1.1865651607513428, "logps/chosen": -32.205535888671875, "logps/rejected": -32.205535888671875, "loss": 0.3719, "rewards/accuracies": 0.0, "rewards/chosen": 1.8365589380264282, "rewards/margins": 0.0, "rewards/rejected": 1.8365589380264282, "step": 2967 }, { "epoch": 0.66, "learning_rate": 7.841231391980687e-06, "logits/chosen": -1.5556448698043823, "logits/rejected": -1.5556448698043823, "logps/chosen": -67.893798828125, "logps/rejected": -67.893798828125, "loss": 0.4135, "rewards/accuracies": 0.0, "rewards/chosen": 2.73974609375, "rewards/margins": 0.0, "rewards/rejected": 2.73974609375, "step": 2968 }, { "epoch": 0.66, "learning_rate": 7.839756377012453e-06, "logits/chosen": -1.3808633089065552, "logits/rejected": -1.3808633089065552, "logps/chosen": -74.45191955566406, "logps/rejected": -74.45191955566406, "loss": 0.3631, "rewards/accuracies": 0.0, "rewards/chosen": 0.8245651125907898, "rewards/margins": 0.0, "rewards/rejected": 0.8245651125907898, "step": 2969 }, { "epoch": 0.66, "learning_rate": 7.838280997142355e-06, "logits/chosen": -1.5107260942459106, "logits/rejected": -1.5107260942459106, "logps/chosen": -37.915992736816406, "logps/rejected": -37.915992736816406, "loss": 0.3674, "rewards/accuracies": 0.0, "rewards/chosen": 2.9488556385040283, "rewards/margins": 0.0, "rewards/rejected": 2.9488556385040283, "step": 2970 }, { "epoch": 0.66, "learning_rate": 7.836805252559971e-06, "logits/chosen": -1.8876615762710571, "logits/rejected": -1.8435275554656982, "logps/chosen": -46.739479064941406, "logps/rejected": -75.50786590576172, "loss": 0.2895, "rewards/accuracies": 1.0, "rewards/chosen": 6.414279937744141, "rewards/margins": 0.4482235908508301, "rewards/rejected": 5.9660563468933105, "step": 2971 }, { "epoch": 0.66, "learning_rate": 7.83532914345493e-06, "logits/chosen": -1.5813279151916504, "logits/rejected": -1.5613293647766113, "logps/chosen": -63.03400421142578, "logps/rejected": -46.41057205200195, "loss": 0.1825, "rewards/accuracies": 1.0, "rewards/chosen": 3.1005845069885254, "rewards/margins": 1.0726277828216553, "rewards/rejected": 2.02795672416687, "step": 2972 }, { "epoch": 0.66, "learning_rate": 7.833852670016912e-06, "logits/chosen": -1.6772211790084839, "logits/rejected": -1.667272686958313, "logps/chosen": -43.983699798583984, "logps/rejected": -56.264892578125, "loss": 0.9011, "rewards/accuracies": 0.0, "rewards/chosen": 2.7460696697235107, "rewards/margins": -1.4099791049957275, "rewards/rejected": 4.156048774719238, "step": 2973 }, { "epoch": 0.66, "learning_rate": 7.832375832435637e-06, "logits/chosen": -1.8147472143173218, "logits/rejected": -1.7070109844207764, "logps/chosen": -91.20260620117188, "logps/rejected": -60.853153228759766, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 6.81386137008667, "rewards/margins": 3.8485724925994873, "rewards/rejected": 2.9652888774871826, "step": 2974 }, { "epoch": 0.66, "learning_rate": 7.830898630900877e-06, "logits/chosen": -1.6185901165008545, "logits/rejected": -1.6135307550430298, "logps/chosen": -77.91288757324219, "logps/rejected": -106.16610717773438, "loss": 1.5287, "rewards/accuracies": 0.0, "rewards/chosen": 3.4438118934631348, "rewards/margins": -2.98758602142334, "rewards/rejected": 6.431397914886475, "step": 2975 }, { "epoch": 0.66, "learning_rate": 7.829421065602448e-06, "logits/chosen": -1.4202024936676025, "logits/rejected": -1.495055913925171, "logps/chosen": -68.1943359375, "logps/rejected": -169.65573120117188, "loss": 1.1292, "rewards/accuracies": 0.0, "rewards/chosen": 4.828919887542725, "rewards/margins": -0.03493213653564453, "rewards/rejected": 4.863852024078369, "step": 2976 }, { "epoch": 0.66, "learning_rate": 7.827943136730214e-06, "logits/chosen": -1.619236707687378, "logits/rejected": -1.619236707687378, "logps/chosen": -58.669639587402344, "logps/rejected": -58.669639587402344, "loss": 0.3866, "rewards/accuracies": 0.0, "rewards/chosen": 5.402174472808838, "rewards/margins": 0.0, "rewards/rejected": 5.402174472808838, "step": 2977 }, { "epoch": 0.66, "learning_rate": 7.826464844474086e-06, "logits/chosen": -1.7612215280532837, "logits/rejected": -1.744956135749817, "logps/chosen": -95.77111053466797, "logps/rejected": -102.19047546386719, "loss": 0.45, "rewards/accuracies": 1.0, "rewards/chosen": 8.031020164489746, "rewards/margins": 2.9809603691101074, "rewards/rejected": 5.050059795379639, "step": 2978 }, { "epoch": 0.66, "learning_rate": 7.82498618902402e-06, "logits/chosen": -1.4515495300292969, "logits/rejected": -1.4820241928100586, "logps/chosen": -68.86906433105469, "logps/rejected": -99.47251892089844, "loss": 0.1356, "rewards/accuracies": 1.0, "rewards/chosen": 4.8839616775512695, "rewards/margins": 1.1711304187774658, "rewards/rejected": 3.7128312587738037, "step": 2979 }, { "epoch": 0.66, "learning_rate": 7.823507170570018e-06, "logits/chosen": -1.7916103601455688, "logits/rejected": -1.7587295770645142, "logps/chosen": -39.0023307800293, "logps/rejected": -39.351707458496094, "loss": 1.1078, "rewards/accuracies": 0.0, "rewards/chosen": 2.768455982208252, "rewards/margins": -0.1282045841217041, "rewards/rejected": 2.896660566329956, "step": 2980 }, { "epoch": 0.66, "learning_rate": 7.822027789302134e-06, "logits/chosen": -1.488893747329712, "logits/rejected": -1.480908989906311, "logps/chosen": -74.37345886230469, "logps/rejected": -74.40444946289062, "loss": 0.493, "rewards/accuracies": 0.0, "rewards/chosen": 2.438375949859619, "rewards/margins": -0.2789275646209717, "rewards/rejected": 2.717303514480591, "step": 2981 }, { "epoch": 0.66, "learning_rate": 7.820548045410462e-06, "logits/chosen": -1.4834319353103638, "logits/rejected": -1.4261664152145386, "logps/chosen": -48.21192932128906, "logps/rejected": -18.349178314208984, "loss": 1.1436, "rewards/accuracies": 1.0, "rewards/chosen": 6.0857157707214355, "rewards/margins": 3.809030771255493, "rewards/rejected": 2.2766849994659424, "step": 2982 }, { "epoch": 0.66, "learning_rate": 7.819067939085145e-06, "logits/chosen": -1.765330195426941, "logits/rejected": -1.643661379814148, "logps/chosen": -155.91575622558594, "logps/rejected": -36.45368194580078, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 6.608561992645264, "rewards/margins": 4.918939590454102, "rewards/rejected": 1.6896225214004517, "step": 2983 }, { "epoch": 0.66, "learning_rate": 7.817587470516378e-06, "logits/chosen": -1.6002788543701172, "logits/rejected": -1.5264363288879395, "logps/chosen": -104.25110626220703, "logps/rejected": -79.79611206054688, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": 6.784564971923828, "rewards/margins": 4.355317115783691, "rewards/rejected": 2.429248094558716, "step": 2984 }, { "epoch": 0.66, "learning_rate": 7.816106639894392e-06, "logits/chosen": -1.902032732963562, "logits/rejected": -1.8494305610656738, "logps/chosen": -34.25205612182617, "logps/rejected": -55.932228088378906, "loss": 0.3819, "rewards/accuracies": 0.0, "rewards/chosen": 2.4674580097198486, "rewards/margins": -0.04584693908691406, "rewards/rejected": 2.5133049488067627, "step": 2985 }, { "epoch": 0.66, "learning_rate": 7.814625447409474e-06, "logits/chosen": -1.580558180809021, "logits/rejected": -1.416010856628418, "logps/chosen": -150.9844207763672, "logps/rejected": -41.15362548828125, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 8.984843254089355, "rewards/margins": 5.818960189819336, "rewards/rejected": 3.1658828258514404, "step": 2986 }, { "epoch": 0.66, "learning_rate": 7.813143893251951e-06, "logits/chosen": -1.5103310346603394, "logits/rejected": -1.4654042720794678, "logps/chosen": -64.95606231689453, "logps/rejected": -37.5213508605957, "loss": 0.5056, "rewards/accuracies": 0.0, "rewards/chosen": 2.3281326293945312, "rewards/margins": -0.5031063556671143, "rewards/rejected": 2.8312389850616455, "step": 2987 }, { "epoch": 0.66, "learning_rate": 7.811661977612202e-06, "logits/chosen": -1.5626816749572754, "logits/rejected": -1.5259150266647339, "logps/chosen": -38.74348449707031, "logps/rejected": -69.01498413085938, "loss": 0.3917, "rewards/accuracies": 1.0, "rewards/chosen": 3.40106201171875, "rewards/margins": 2.810594081878662, "rewards/rejected": 0.5904678702354431, "step": 2988 }, { "epoch": 0.66, "learning_rate": 7.810179700680646e-06, "logits/chosen": -1.7409018278121948, "logits/rejected": -1.6610273122787476, "logps/chosen": -74.53347778320312, "logps/rejected": -30.379627227783203, "loss": 0.3274, "rewards/accuracies": 1.0, "rewards/chosen": 3.06131911277771, "rewards/margins": 0.3482201099395752, "rewards/rejected": 2.7130990028381348, "step": 2989 }, { "epoch": 0.66, "learning_rate": 7.808697062647755e-06, "logits/chosen": -1.8117836713790894, "logits/rejected": -1.8015015125274658, "logps/chosen": -48.94038772583008, "logps/rejected": -27.944725036621094, "loss": 0.4208, "rewards/accuracies": 0.0, "rewards/chosen": 2.7136218547821045, "rewards/margins": -0.2536613941192627, "rewards/rejected": 2.967283248901367, "step": 2990 }, { "epoch": 0.66, "learning_rate": 7.807214063704042e-06, "logits/chosen": -1.2984261512756348, "logits/rejected": -1.264295220375061, "logps/chosen": -64.81008911132812, "logps/rejected": -26.17302703857422, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 3.53515625, "rewards/margins": 2.0460338592529297, "rewards/rejected": 1.4891223907470703, "step": 2991 }, { "epoch": 0.66, "learning_rate": 7.805730704040072e-06, "logits/chosen": -1.875282645225525, "logits/rejected": -1.741215705871582, "logps/chosen": -142.24986267089844, "logps/rejected": -47.88877868652344, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 5.963453769683838, "rewards/margins": 2.417266845703125, "rewards/rejected": 3.546186923980713, "step": 2992 }, { "epoch": 0.66, "learning_rate": 7.804246983846449e-06, "logits/chosen": -1.3416855335235596, "logits/rejected": -1.3519487380981445, "logps/chosen": -59.19447326660156, "logps/rejected": -59.06290817260742, "loss": 0.9342, "rewards/accuracies": 0.0, "rewards/chosen": 2.532705783843994, "rewards/margins": -0.08560681343078613, "rewards/rejected": 2.6183125972747803, "step": 2993 }, { "epoch": 0.66, "learning_rate": 7.802762903313831e-06, "logits/chosen": -1.613479495048523, "logits/rejected": -1.6194247007369995, "logps/chosen": -104.99334716796875, "logps/rejected": -98.87080383300781, "loss": 1.226, "rewards/accuracies": 1.0, "rewards/chosen": 6.596930027008057, "rewards/margins": 3.107161045074463, "rewards/rejected": 3.4897689819335938, "step": 2994 }, { "epoch": 0.66, "learning_rate": 7.80127846263292e-06, "logits/chosen": -1.6145906448364258, "logits/rejected": -1.581659197807312, "logps/chosen": -50.873836517333984, "logps/rejected": -60.62403869628906, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": 3.329024076461792, "rewards/margins": -0.7520797252655029, "rewards/rejected": 4.081103801727295, "step": 2995 }, { "epoch": 0.66, "learning_rate": 7.799793661994457e-06, "logits/chosen": -1.9433413743972778, "logits/rejected": -1.957021951675415, "logps/chosen": -102.54889678955078, "logps/rejected": -70.17095184326172, "loss": 0.078, "rewards/accuracies": 1.0, "rewards/chosen": 5.591095924377441, "rewards/margins": 1.7996277809143066, "rewards/rejected": 3.7914681434631348, "step": 2996 }, { "epoch": 0.66, "learning_rate": 7.79830850158924e-06, "logits/chosen": -1.658212661743164, "logits/rejected": -1.5880929231643677, "logps/chosen": -107.10517883300781, "logps/rejected": -35.7694091796875, "loss": 0.1403, "rewards/accuracies": 1.0, "rewards/chosen": 4.587037563323975, "rewards/margins": 1.6470792293548584, "rewards/rejected": 2.939958333969116, "step": 2997 }, { "epoch": 0.66, "learning_rate": 7.796822981608109e-06, "logits/chosen": -1.7119942903518677, "logits/rejected": -1.663334846496582, "logps/chosen": -69.25972747802734, "logps/rejected": -59.47410202026367, "loss": 1.7167, "rewards/accuracies": 1.0, "rewards/chosen": 5.237066745758057, "rewards/margins": 2.1965863704681396, "rewards/rejected": 3.040480375289917, "step": 2998 }, { "epoch": 0.66, "learning_rate": 7.795337102241948e-06, "logits/chosen": -1.7146440744400024, "logits/rejected": -1.5322701930999756, "logps/chosen": -98.41096496582031, "logps/rejected": -63.6135368347168, "loss": 2.2207, "rewards/accuracies": 1.0, "rewards/chosen": 6.802867412567139, "rewards/margins": 3.5803327560424805, "rewards/rejected": 3.222534656524658, "step": 2999 }, { "epoch": 0.66, "learning_rate": 7.793850863681688e-06, "logits/chosen": -1.859073519706726, "logits/rejected": -1.7919749021530151, "logps/chosen": -98.05561828613281, "logps/rejected": -174.16184997558594, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": 9.466241836547852, "rewards/margins": 1.478569507598877, "rewards/rejected": 7.987672328948975, "step": 3000 }, { "epoch": 0.66, "learning_rate": 7.79236426611831e-06, "logits/chosen": -1.5296874046325684, "logits/rejected": -1.4426039457321167, "logps/chosen": -49.984622955322266, "logps/rejected": -40.67826461791992, "loss": 0.5332, "rewards/accuracies": 0.0, "rewards/chosen": 3.180018186569214, "rewards/margins": -0.31145477294921875, "rewards/rejected": 3.4914729595184326, "step": 3001 }, { "epoch": 0.66, "learning_rate": 7.790877309742833e-06, "logits/chosen": -1.5940810441970825, "logits/rejected": -1.58617103099823, "logps/chosen": -69.91424560546875, "logps/rejected": -86.13490295410156, "loss": 1.5791, "rewards/accuracies": 0.0, "rewards/chosen": 2.9346420764923096, "rewards/margins": -3.09059739112854, "rewards/rejected": 6.02523946762085, "step": 3002 }, { "epoch": 0.66, "learning_rate": 7.789389994746334e-06, "logits/chosen": -1.6891871690750122, "logits/rejected": -1.663818359375, "logps/chosen": -88.3307876586914, "logps/rejected": -99.6058120727539, "loss": 1.8906, "rewards/accuracies": 0.0, "rewards/chosen": 4.513088226318359, "rewards/margins": -1.879748821258545, "rewards/rejected": 6.392837047576904, "step": 3003 }, { "epoch": 0.66, "learning_rate": 7.787902321319925e-06, "logits/chosen": -1.6485539674758911, "logits/rejected": -1.5515395402908325, "logps/chosen": -37.585227966308594, "logps/rejected": -78.26603698730469, "loss": 1.03, "rewards/accuracies": 0.0, "rewards/chosen": 2.9133477210998535, "rewards/margins": -1.043809413909912, "rewards/rejected": 3.9571571350097656, "step": 3004 }, { "epoch": 0.67, "learning_rate": 7.786414289654768e-06, "logits/chosen": -1.8015553951263428, "logits/rejected": -1.8015553951263428, "logps/chosen": -55.56072998046875, "logps/rejected": -55.56072998046875, "loss": 1.7466, "rewards/accuracies": 0.0, "rewards/chosen": 3.8924293518066406, "rewards/margins": 0.0, "rewards/rejected": 3.8924293518066406, "step": 3005 }, { "epoch": 0.67, "learning_rate": 7.784925899942075e-06, "logits/chosen": -1.3129693269729614, "logits/rejected": -1.3129693269729614, "logps/chosen": -11.266740798950195, "logps/rejected": -11.266740798950195, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.3207625150680542, "rewards/margins": 0.0, "rewards/rejected": 1.3207625150680542, "step": 3006 }, { "epoch": 0.67, "learning_rate": 7.7834371523731e-06, "logits/chosen": -1.7403639554977417, "logits/rejected": -1.5801395177841187, "logps/chosen": -75.76496887207031, "logps/rejected": -64.37384033203125, "loss": 1.0939, "rewards/accuracies": 1.0, "rewards/chosen": 7.15133810043335, "rewards/margins": 1.229665756225586, "rewards/rejected": 5.921672344207764, "step": 3007 }, { "epoch": 0.67, "learning_rate": 7.781948047139139e-06, "logits/chosen": -1.7859166860580444, "logits/rejected": -1.8036209344863892, "logps/chosen": -152.69818115234375, "logps/rejected": -128.23916625976562, "loss": 0.3064, "rewards/accuracies": 1.0, "rewards/chosen": 8.931597709655762, "rewards/margins": 1.9387450218200684, "rewards/rejected": 6.992852687835693, "step": 3008 }, { "epoch": 0.67, "learning_rate": 7.780458584431545e-06, "logits/chosen": -1.4384781122207642, "logits/rejected": -1.4197864532470703, "logps/chosen": -35.47119140625, "logps/rejected": -42.00045394897461, "loss": 0.5063, "rewards/accuracies": 0.0, "rewards/chosen": 3.0032546520233154, "rewards/margins": -0.5097386837005615, "rewards/rejected": 3.512993335723877, "step": 3009 }, { "epoch": 0.67, "learning_rate": 7.778968764441704e-06, "logits/chosen": -1.8349066972732544, "logits/rejected": -1.8271466493606567, "logps/chosen": -96.05521392822266, "logps/rejected": -116.73414611816406, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 7.198641300201416, "rewards/margins": 0.41724634170532227, "rewards/rejected": 6.781394958496094, "step": 3010 }, { "epoch": 0.67, "learning_rate": 7.777478587361058e-06, "logits/chosen": -1.4616206884384155, "logits/rejected": -1.4496780633926392, "logps/chosen": -33.043277740478516, "logps/rejected": -37.4216194152832, "loss": 0.4984, "rewards/accuracies": 1.0, "rewards/chosen": 3.1738598346710205, "rewards/margins": 0.7247931957244873, "rewards/rejected": 2.449066638946533, "step": 3011 }, { "epoch": 0.67, "learning_rate": 7.775988053381092e-06, "logits/chosen": -1.4857479333877563, "logits/rejected": -1.5381759405136108, "logps/chosen": -81.00566864013672, "logps/rejected": -77.5623550415039, "loss": 3.2929, "rewards/accuracies": 0.0, "rewards/chosen": 2.2588157653808594, "rewards/margins": -6.482510566711426, "rewards/rejected": 8.741326332092285, "step": 3012 }, { "epoch": 0.67, "learning_rate": 7.774497162693333e-06, "logits/chosen": -1.6043649911880493, "logits/rejected": -1.5547820329666138, "logps/chosen": -44.52348709106445, "logps/rejected": -17.438343048095703, "loss": 0.7068, "rewards/accuracies": 1.0, "rewards/chosen": 2.381274938583374, "rewards/margins": 0.15226149559020996, "rewards/rejected": 2.229013442993164, "step": 3013 }, { "epoch": 0.67, "learning_rate": 7.773005915489358e-06, "logits/chosen": -1.9495930671691895, "logits/rejected": -1.945243000984192, "logps/chosen": -53.35763931274414, "logps/rejected": -82.27142333984375, "loss": 1.2631, "rewards/accuracies": 0.0, "rewards/chosen": 2.4848194122314453, "rewards/margins": -1.7132978439331055, "rewards/rejected": 4.198117256164551, "step": 3014 }, { "epoch": 0.67, "learning_rate": 7.77151431196079e-06, "logits/chosen": -1.759943962097168, "logits/rejected": -1.693576693534851, "logps/chosen": -56.4031867980957, "logps/rejected": -59.69866943359375, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 3.965763568878174, "rewards/margins": 2.209580659866333, "rewards/rejected": 1.7561829090118408, "step": 3015 }, { "epoch": 0.67, "learning_rate": 7.770022352299294e-06, "logits/chosen": -1.4525394439697266, "logits/rejected": -1.516837239265442, "logps/chosen": -92.13912963867188, "logps/rejected": -125.23084259033203, "loss": 2.7621, "rewards/accuracies": 0.0, "rewards/chosen": 4.4126129150390625, "rewards/margins": -5.261538505554199, "rewards/rejected": 9.674151420593262, "step": 3016 }, { "epoch": 0.67, "learning_rate": 7.768530036696585e-06, "logits/chosen": -1.5178966522216797, "logits/rejected": -1.5502698421478271, "logps/chosen": -47.252723693847656, "logps/rejected": -80.5881118774414, "loss": 1.3352, "rewards/accuracies": 0.0, "rewards/chosen": 2.325698137283325, "rewards/margins": -1.9691717624664307, "rewards/rejected": 4.294869899749756, "step": 3017 }, { "epoch": 0.67, "learning_rate": 7.767037365344422e-06, "logits/chosen": -1.5703908205032349, "logits/rejected": -1.6172846555709839, "logps/chosen": -82.35285949707031, "logps/rejected": -107.25407409667969, "loss": 2.4683, "rewards/accuracies": 0.0, "rewards/chosen": 7.080897808074951, "rewards/margins": -0.9837098121643066, "rewards/rejected": 8.064607620239258, "step": 3018 }, { "epoch": 0.67, "learning_rate": 7.76554433843461e-06, "logits/chosen": -1.5753520727157593, "logits/rejected": -1.5606797933578491, "logps/chosen": -59.32016372680664, "logps/rejected": -68.01585388183594, "loss": 0.8832, "rewards/accuracies": 0.0, "rewards/chosen": 3.0398716926574707, "rewards/margins": -1.353322982788086, "rewards/rejected": 4.393194675445557, "step": 3019 }, { "epoch": 0.67, "learning_rate": 7.764050956159e-06, "logits/chosen": -1.7408236265182495, "logits/rejected": -1.739351511001587, "logps/chosen": -60.642330169677734, "logps/rejected": -79.68817138671875, "loss": 0.624, "rewards/accuracies": 0.0, "rewards/chosen": 2.7562458515167236, "rewards/margins": -0.7363924980163574, "rewards/rejected": 3.492638349533081, "step": 3020 }, { "epoch": 0.67, "learning_rate": 7.762557218709484e-06, "logits/chosen": -1.5999113321304321, "logits/rejected": -1.545310378074646, "logps/chosen": -51.115440368652344, "logps/rejected": -62.69663619995117, "loss": 0.1777, "rewards/accuracies": 1.0, "rewards/chosen": 2.7880592346191406, "rewards/margins": 0.9990848302841187, "rewards/rejected": 1.788974404335022, "step": 3021 }, { "epoch": 0.67, "learning_rate": 7.761063126278006e-06, "logits/chosen": -1.6727252006530762, "logits/rejected": -1.6109123229980469, "logps/chosen": -120.37371826171875, "logps/rejected": -50.76197814941406, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 6.2638092041015625, "rewards/margins": 3.2726378440856934, "rewards/rejected": 2.991171360015869, "step": 3022 }, { "epoch": 0.67, "learning_rate": 7.759568679056554e-06, "logits/chosen": -1.7037427425384521, "logits/rejected": -1.6384166479110718, "logps/chosen": -109.63104248046875, "logps/rejected": -78.76100158691406, "loss": 0.4962, "rewards/accuracies": 1.0, "rewards/chosen": 5.886422634124756, "rewards/margins": 2.153104305267334, "rewards/rejected": 3.733318328857422, "step": 3023 }, { "epoch": 0.67, "learning_rate": 7.758073877237164e-06, "logits/chosen": -1.6307235956192017, "logits/rejected": -1.6307235956192017, "logps/chosen": -49.46571350097656, "logps/rejected": -49.46571350097656, "loss": 0.8393, "rewards/accuracies": 0.0, "rewards/chosen": 3.759169101715088, "rewards/margins": 0.0, "rewards/rejected": 3.759169101715088, "step": 3024 }, { "epoch": 0.67, "learning_rate": 7.756578721011908e-06, "logits/chosen": -1.5745277404785156, "logits/rejected": -1.575370192527771, "logps/chosen": -65.03878784179688, "logps/rejected": -36.837135314941406, "loss": 1.029, "rewards/accuracies": 0.0, "rewards/chosen": 1.7160362005233765, "rewards/margins": -0.31433236598968506, "rewards/rejected": 2.0303685665130615, "step": 3025 }, { "epoch": 0.67, "learning_rate": 7.755083210572914e-06, "logits/chosen": -1.8378537893295288, "logits/rejected": -1.8373360633850098, "logps/chosen": -61.31636047363281, "logps/rejected": -66.51146697998047, "loss": 0.2347, "rewards/accuracies": 1.0, "rewards/chosen": 3.827056884765625, "rewards/margins": 0.8049278259277344, "rewards/rejected": 3.0221290588378906, "step": 3026 }, { "epoch": 0.67, "learning_rate": 7.75358734611235e-06, "logits/chosen": -1.5853219032287598, "logits/rejected": -1.5344258546829224, "logps/chosen": -65.5254898071289, "logps/rejected": -36.254024505615234, "loss": 0.3452, "rewards/accuracies": 1.0, "rewards/chosen": 2.7600274085998535, "rewards/margins": 0.021244525909423828, "rewards/rejected": 2.7387828826904297, "step": 3027 }, { "epoch": 0.67, "learning_rate": 7.75209112782243e-06, "logits/chosen": -2.0020246505737305, "logits/rejected": -1.8667670488357544, "logps/chosen": -138.97329711914062, "logps/rejected": -13.765358924865723, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 5.755577087402344, "rewards/margins": 3.0729339122772217, "rewards/rejected": 2.682643175125122, "step": 3028 }, { "epoch": 0.67, "learning_rate": 7.75059455589542e-06, "logits/chosen": -1.7088207006454468, "logits/rejected": -1.7317321300506592, "logps/chosen": -108.2099838256836, "logps/rejected": -145.578125, "loss": 0.1795, "rewards/accuracies": 1.0, "rewards/chosen": 7.993012428283691, "rewards/margins": 0.868532657623291, "rewards/rejected": 7.1244797706604, "step": 3029 }, { "epoch": 0.67, "learning_rate": 7.749097630523618e-06, "logits/chosen": -1.9490394592285156, "logits/rejected": -1.885391116142273, "logps/chosen": -49.9408073425293, "logps/rejected": -20.880901336669922, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 3.607786178588867, "rewards/margins": 2.8168773651123047, "rewards/rejected": 0.7909088134765625, "step": 3030 }, { "epoch": 0.67, "learning_rate": 7.74760035189938e-06, "logits/chosen": -1.252485752105713, "logits/rejected": -1.2338604927062988, "logps/chosen": -11.43514633178711, "logps/rejected": -12.81461238861084, "loss": 1.4044, "rewards/accuracies": 0.0, "rewards/chosen": 0.7346517443656921, "rewards/margins": -2.550889015197754, "rewards/rejected": 3.285540819168091, "step": 3031 }, { "epoch": 0.67, "learning_rate": 7.746102720215102e-06, "logits/chosen": -1.536831021308899, "logits/rejected": -1.5771048069000244, "logps/chosen": -163.33689880371094, "logps/rejected": -124.29501342773438, "loss": 1.0333, "rewards/accuracies": 0.0, "rewards/chosen": 6.0727128982543945, "rewards/margins": -1.9255690574645996, "rewards/rejected": 7.998281955718994, "step": 3032 }, { "epoch": 0.67, "learning_rate": 7.744604735663227e-06, "logits/chosen": -1.87711763381958, "logits/rejected": -1.8483314514160156, "logps/chosen": -133.68226623535156, "logps/rejected": -95.55538940429688, "loss": 3.9992, "rewards/accuracies": 0.0, "rewards/chosen": 6.311557292938232, "rewards/margins": -4.099069118499756, "rewards/rejected": 10.410626411437988, "step": 3033 }, { "epoch": 0.67, "learning_rate": 7.74310639843624e-06, "logits/chosen": -1.6395177841186523, "logits/rejected": -1.5433082580566406, "logps/chosen": -82.51390075683594, "logps/rejected": -154.877197265625, "loss": 1.0323, "rewards/accuracies": 0.0, "rewards/chosen": 5.437361240386963, "rewards/margins": -1.350782871246338, "rewards/rejected": 6.788144111633301, "step": 3034 }, { "epoch": 0.67, "learning_rate": 7.741607708726675e-06, "logits/chosen": -1.6237070560455322, "logits/rejected": -1.6016093492507935, "logps/chosen": -56.33089065551758, "logps/rejected": -52.60957336425781, "loss": 0.4096, "rewards/accuracies": 0.0, "rewards/chosen": 3.4320881366729736, "rewards/margins": -0.22922325134277344, "rewards/rejected": 3.661311388015747, "step": 3035 }, { "epoch": 0.67, "learning_rate": 7.740108666727111e-06, "logits/chosen": -1.5492897033691406, "logits/rejected": -1.6880050897598267, "logps/chosen": -36.5820426940918, "logps/rejected": -78.20484924316406, "loss": 2.7909, "rewards/accuracies": 0.0, "rewards/chosen": 3.477679967880249, "rewards/margins": -4.681217193603516, "rewards/rejected": 8.158897399902344, "step": 3036 }, { "epoch": 0.67, "learning_rate": 7.73860927263017e-06, "logits/chosen": -1.5212820768356323, "logits/rejected": -1.3594913482666016, "logps/chosen": -135.83953857421875, "logps/rejected": -14.924699783325195, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 6.074487209320068, "rewards/margins": 4.888009548187256, "rewards/rejected": 1.186477541923523, "step": 3037 }, { "epoch": 0.67, "learning_rate": 7.73710952662852e-06, "logits/chosen": -1.7085644006729126, "logits/rejected": -1.7162725925445557, "logps/chosen": -132.62596130371094, "logps/rejected": -150.83392333984375, "loss": 4.6727, "rewards/accuracies": 0.0, "rewards/chosen": 1.3918198347091675, "rewards/margins": -9.195344924926758, "rewards/rejected": 10.587164878845215, "step": 3038 }, { "epoch": 0.67, "learning_rate": 7.735609428914878e-06, "logits/chosen": -1.549429178237915, "logits/rejected": -1.5562955141067505, "logps/chosen": -39.06983947753906, "logps/rejected": -43.53986740112305, "loss": 0.3081, "rewards/accuracies": 1.0, "rewards/chosen": 3.121325731277466, "rewards/margins": 0.25920915603637695, "rewards/rejected": 2.862116575241089, "step": 3039 }, { "epoch": 0.67, "learning_rate": 7.734108979681998e-06, "logits/chosen": -1.5288718938827515, "logits/rejected": -1.4470218420028687, "logps/chosen": -32.132301330566406, "logps/rejected": -8.61889362335205, "loss": 0.4765, "rewards/accuracies": 1.0, "rewards/chosen": 2.415806293487549, "rewards/margins": 1.7149782180786133, "rewards/rejected": 0.7008280754089355, "step": 3040 }, { "epoch": 0.67, "learning_rate": 7.732608179122689e-06, "logits/chosen": -1.6886755228042603, "logits/rejected": -1.7202978134155273, "logps/chosen": -76.45933532714844, "logps/rejected": -134.87820434570312, "loss": 0.8998, "rewards/accuracies": 0.0, "rewards/chosen": 6.546933174133301, "rewards/margins": -1.5973081588745117, "rewards/rejected": 8.144241333007812, "step": 3041 }, { "epoch": 0.67, "learning_rate": 7.731107027429797e-06, "logits/chosen": -2.0093376636505127, "logits/rejected": -1.9604458808898926, "logps/chosen": -53.91453552246094, "logps/rejected": -49.86030960083008, "loss": 0.144, "rewards/accuracies": 1.0, "rewards/chosen": 3.693598985671997, "rewards/margins": 1.3856868743896484, "rewards/rejected": 2.3079121112823486, "step": 3042 }, { "epoch": 0.67, "learning_rate": 7.729605524796215e-06, "logits/chosen": -1.6328139305114746, "logits/rejected": -1.5583819150924683, "logps/chosen": -125.43885803222656, "logps/rejected": -54.35005569458008, "loss": 1.1866, "rewards/accuracies": 1.0, "rewards/chosen": 5.400401592254639, "rewards/margins": 2.6225697994232178, "rewards/rejected": 2.777831792831421, "step": 3043 }, { "epoch": 0.67, "learning_rate": 7.728103671414889e-06, "logits/chosen": -1.5815649032592773, "logits/rejected": -1.5599604845046997, "logps/chosen": -63.33577346801758, "logps/rejected": -60.62354278564453, "loss": 1.0604, "rewards/accuracies": 0.0, "rewards/chosen": 3.135136127471924, "rewards/margins": -1.1534924507141113, "rewards/rejected": 4.288628578186035, "step": 3044 }, { "epoch": 0.67, "learning_rate": 7.726601467478796e-06, "logits/chosen": -1.482696294784546, "logits/rejected": -1.2966517210006714, "logps/chosen": -148.19219970703125, "logps/rejected": -36.04305648803711, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": 11.201959609985352, "rewards/margins": 7.581925868988037, "rewards/rejected": 3.6200337409973145, "step": 3045 }, { "epoch": 0.67, "learning_rate": 7.72509891318097e-06, "logits/chosen": -2.0090067386627197, "logits/rejected": -2.0305068492889404, "logps/chosen": -79.72846984863281, "logps/rejected": -131.52694702148438, "loss": 3.108, "rewards/accuracies": 0.0, "rewards/chosen": 2.0440597534179688, "rewards/margins": -3.9923110008239746, "rewards/rejected": 6.036370754241943, "step": 3046 }, { "epoch": 0.67, "learning_rate": 7.723596008714486e-06, "logits/chosen": -1.6032850742340088, "logits/rejected": -1.6134954690933228, "logps/chosen": -128.9569091796875, "logps/rejected": -62.81551742553711, "loss": 0.1186, "rewards/accuracies": 1.0, "rewards/chosen": 5.040762424468994, "rewards/margins": 1.391251802444458, "rewards/rejected": 3.649510622024536, "step": 3047 }, { "epoch": 0.67, "learning_rate": 7.722092754272462e-06, "logits/chosen": -1.2313072681427002, "logits/rejected": -1.196914792060852, "logps/chosen": -44.922218322753906, "logps/rejected": -73.90309143066406, "loss": 0.9678, "rewards/accuracies": 1.0, "rewards/chosen": 2.3504478931427, "rewards/margins": 0.6501563787460327, "rewards/rejected": 1.7002915143966675, "step": 3048 }, { "epoch": 0.67, "learning_rate": 7.720589150048062e-06, "logits/chosen": -1.5319608449935913, "logits/rejected": -1.4898998737335205, "logps/chosen": -50.09413528442383, "logps/rejected": -54.91168212890625, "loss": 1.1091, "rewards/accuracies": 0.0, "rewards/chosen": 2.740612506866455, "rewards/margins": -2.096492290496826, "rewards/rejected": 4.837104797363281, "step": 3049 }, { "epoch": 0.68, "learning_rate": 7.719085196234497e-06, "logits/chosen": -1.865407109260559, "logits/rejected": -1.6314685344696045, "logps/chosen": -94.86991119384766, "logps/rejected": -109.03781127929688, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 4.639751434326172, "rewards/margins": 6.338039398193359, "rewards/rejected": -1.6982879638671875, "step": 3050 }, { "epoch": 0.68, "learning_rate": 7.71758089302502e-06, "logits/chosen": -1.3196228742599487, "logits/rejected": -1.1596944332122803, "logps/chosen": -73.91975402832031, "logps/rejected": -8.573186874389648, "loss": 2.9893, "rewards/accuracies": 1.0, "rewards/chosen": 2.8824660778045654, "rewards/margins": 1.8656786680221558, "rewards/rejected": 1.0167874097824097, "step": 3051 }, { "epoch": 0.68, "learning_rate": 7.71607624061293e-06, "logits/chosen": -1.8068842887878418, "logits/rejected": -1.7082548141479492, "logps/chosen": -53.512359619140625, "logps/rejected": -49.138824462890625, "loss": 0.6579, "rewards/accuracies": 1.0, "rewards/chosen": 3.546905517578125, "rewards/margins": 0.146240234375, "rewards/rejected": 3.400665283203125, "step": 3052 }, { "epoch": 0.68, "learning_rate": 7.714571239191575e-06, "logits/chosen": -1.8802775144577026, "logits/rejected": -1.8447513580322266, "logps/chosen": -63.87660217285156, "logps/rejected": -110.7171859741211, "loss": 0.8589, "rewards/accuracies": 0.0, "rewards/chosen": 3.5163185596466064, "rewards/margins": -1.4206697940826416, "rewards/rejected": 4.936988353729248, "step": 3053 }, { "epoch": 0.68, "learning_rate": 7.713065888954339e-06, "logits/chosen": -1.4432860612869263, "logits/rejected": -1.2734181880950928, "logps/chosen": -73.5009994506836, "logps/rejected": -23.16261100769043, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": 4.128338813781738, "rewards/margins": 3.684922695159912, "rewards/rejected": 0.443416029214859, "step": 3054 }, { "epoch": 0.68, "learning_rate": 7.711560190094659e-06, "logits/chosen": -1.5437566041946411, "logits/rejected": -1.4934388399124146, "logps/chosen": -48.086647033691406, "logps/rejected": -55.451019287109375, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 4.275699138641357, "rewards/margins": 0.6146821975708008, "rewards/rejected": 3.6610169410705566, "step": 3055 }, { "epoch": 0.68, "learning_rate": 7.710054142806015e-06, "logits/chosen": -1.896858811378479, "logits/rejected": -1.8933899402618408, "logps/chosen": -48.65083312988281, "logps/rejected": -51.22129821777344, "loss": 0.1794, "rewards/accuracies": 1.0, "rewards/chosen": 4.323182106018066, "rewards/margins": 0.8467180728912354, "rewards/rejected": 3.476464033126831, "step": 3056 }, { "epoch": 0.68, "learning_rate": 7.708547747281925e-06, "logits/chosen": -1.4508185386657715, "logits/rejected": -1.4508185386657715, "logps/chosen": -32.83804702758789, "logps/rejected": -32.83804702758789, "loss": 0.4773, "rewards/accuracies": 0.0, "rewards/chosen": 2.122373580932617, "rewards/margins": 0.0, "rewards/rejected": 2.122373580932617, "step": 3057 }, { "epoch": 0.68, "learning_rate": 7.707041003715962e-06, "logits/chosen": -1.3526891469955444, "logits/rejected": -1.338400959968567, "logps/chosen": -129.97100830078125, "logps/rejected": -166.23753356933594, "loss": 3.0746, "rewards/accuracies": 0.0, "rewards/chosen": 4.1278581619262695, "rewards/margins": -6.129980087280273, "rewards/rejected": 10.257838249206543, "step": 3058 }, { "epoch": 0.68, "learning_rate": 7.70553391230174e-06, "logits/chosen": -1.7549690008163452, "logits/rejected": -1.7642853260040283, "logps/chosen": -87.90399932861328, "logps/rejected": -87.20983123779297, "loss": 0.3688, "rewards/accuracies": 0.0, "rewards/chosen": 4.613773345947266, "rewards/margins": -0.08331918716430664, "rewards/rejected": 4.697092533111572, "step": 3059 }, { "epoch": 0.68, "learning_rate": 7.704026473232912e-06, "logits/chosen": -1.5670467615127563, "logits/rejected": -1.5139350891113281, "logps/chosen": -76.23739624023438, "logps/rejected": -55.732154846191406, "loss": 0.2057, "rewards/accuracies": 1.0, "rewards/chosen": 5.02199125289917, "rewards/margins": 1.8413851261138916, "rewards/rejected": 3.1806061267852783, "step": 3060 }, { "epoch": 0.68, "learning_rate": 7.702518686703182e-06, "logits/chosen": -1.3280916213989258, "logits/rejected": -1.3259363174438477, "logps/chosen": -49.36310577392578, "logps/rejected": -37.226768493652344, "loss": 0.5299, "rewards/accuracies": 0.0, "rewards/chosen": 3.6587905883789062, "rewards/margins": -0.5796999931335449, "rewards/rejected": 4.238490581512451, "step": 3061 }, { "epoch": 0.68, "learning_rate": 7.701010552906298e-06, "logits/chosen": -1.4324833154678345, "logits/rejected": -1.4324833154678345, "logps/chosen": -8.421201705932617, "logps/rejected": -8.421201705932617, "loss": 0.5909, "rewards/accuracies": 0.0, "rewards/chosen": 1.0665172338485718, "rewards/margins": 0.0, "rewards/rejected": 1.0665172338485718, "step": 3062 }, { "epoch": 0.68, "learning_rate": 7.699502072036051e-06, "logits/chosen": -1.429510474205017, "logits/rejected": -1.4004353284835815, "logps/chosen": -94.733642578125, "logps/rejected": -67.18032836914062, "loss": 0.7805, "rewards/accuracies": 0.0, "rewards/chosen": 1.6635849475860596, "rewards/margins": -1.3055357933044434, "rewards/rejected": 2.969120740890503, "step": 3063 }, { "epoch": 0.68, "learning_rate": 7.697993244286276e-06, "logits/chosen": -1.601397156715393, "logits/rejected": -1.5586979389190674, "logps/chosen": -77.80770874023438, "logps/rejected": -122.2079086303711, "loss": 1.2309, "rewards/accuracies": 0.0, "rewards/chosen": 3.86822509765625, "rewards/margins": -2.3565287590026855, "rewards/rejected": 6.2247538566589355, "step": 3064 }, { "epoch": 0.68, "learning_rate": 7.696484069850858e-06, "logits/chosen": -1.766904354095459, "logits/rejected": -1.707710862159729, "logps/chosen": -59.2837028503418, "logps/rejected": -57.46811294555664, "loss": 1.018, "rewards/accuracies": 1.0, "rewards/chosen": 2.9423434734344482, "rewards/margins": 0.030721187591552734, "rewards/rejected": 2.9116222858428955, "step": 3065 }, { "epoch": 0.68, "learning_rate": 7.694974548923717e-06, "logits/chosen": -1.4213862419128418, "logits/rejected": -1.5295658111572266, "logps/chosen": -175.61236572265625, "logps/rejected": -213.67050170898438, "loss": 1.0314, "rewards/accuracies": 0.0, "rewards/chosen": 8.500510215759277, "rewards/margins": -1.919769287109375, "rewards/rejected": 10.420279502868652, "step": 3066 }, { "epoch": 0.68, "learning_rate": 7.693464681698826e-06, "logits/chosen": -1.788224458694458, "logits/rejected": -1.797871470451355, "logps/chosen": -62.72299575805664, "logps/rejected": -41.9241943359375, "loss": 0.4467, "rewards/accuracies": 0.0, "rewards/chosen": 1.9672917127609253, "rewards/margins": -0.32133209705352783, "rewards/rejected": 2.288623809814453, "step": 3067 }, { "epoch": 0.68, "learning_rate": 7.691954468370198e-06, "logits/chosen": -1.7925770282745361, "logits/rejected": -1.6645588874816895, "logps/chosen": -123.2061996459961, "logps/rejected": -45.460304260253906, "loss": 0.4422, "rewards/accuracies": 1.0, "rewards/chosen": 8.666927337646484, "rewards/margins": 8.580633163452148, "rewards/rejected": 0.08629455417394638, "step": 3068 }, { "epoch": 0.68, "learning_rate": 7.69044390913189e-06, "logits/chosen": -1.763506293296814, "logits/rejected": -1.5335975885391235, "logps/chosen": -199.61495971679688, "logps/rejected": -50.04133224487305, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 7.711276531219482, "rewards/margins": 6.556551933288574, "rewards/rejected": 1.1547244787216187, "step": 3069 }, { "epoch": 0.68, "learning_rate": 7.688933004178009e-06, "logits/chosen": -1.5566391944885254, "logits/rejected": -1.578108787536621, "logps/chosen": -60.731781005859375, "logps/rejected": -44.830963134765625, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 1.796807885169983, "rewards/margins": -1.0974410772323608, "rewards/rejected": 2.8942489624023438, "step": 3070 }, { "epoch": 0.68, "learning_rate": 7.687421753702703e-06, "logits/chosen": -1.744451642036438, "logits/rejected": -1.7004097700119019, "logps/chosen": -63.38730239868164, "logps/rejected": -77.04963684082031, "loss": 1.2771, "rewards/accuracies": 0.0, "rewards/chosen": 2.5330045223236084, "rewards/margins": -2.2357723712921143, "rewards/rejected": 4.768776893615723, "step": 3071 }, { "epoch": 0.68, "learning_rate": 7.685910157900158e-06, "logits/chosen": -1.5952779054641724, "logits/rejected": -1.5038806200027466, "logps/chosen": -82.33448791503906, "logps/rejected": -46.353633880615234, "loss": 0.2729, "rewards/accuracies": 1.0, "rewards/chosen": 5.846615791320801, "rewards/margins": 0.6913785934448242, "rewards/rejected": 5.155237197875977, "step": 3072 }, { "epoch": 0.68, "learning_rate": 7.68439821696462e-06, "logits/chosen": -1.5890264511108398, "logits/rejected": -1.6416093111038208, "logps/chosen": -111.51289367675781, "logps/rejected": -192.69607543945312, "loss": 0.4154, "rewards/accuracies": 0.0, "rewards/chosen": 8.500445365905762, "rewards/margins": -0.19250202178955078, "rewards/rejected": 8.692947387695312, "step": 3073 }, { "epoch": 0.68, "learning_rate": 7.682885931090359e-06, "logits/chosen": -1.6860461235046387, "logits/rejected": -1.4870972633361816, "logps/chosen": -43.04693603515625, "logps/rejected": -75.48985290527344, "loss": 1.5612, "rewards/accuracies": 0.0, "rewards/chosen": 3.7903289794921875, "rewards/margins": -0.8682742118835449, "rewards/rejected": 4.658603191375732, "step": 3074 }, { "epoch": 0.68, "learning_rate": 7.681373300471706e-06, "logits/chosen": -1.5726969242095947, "logits/rejected": -1.5701963901519775, "logps/chosen": -45.560302734375, "logps/rejected": -51.76818084716797, "loss": 0.6804, "rewards/accuracies": 0.0, "rewards/chosen": 3.3082642555236816, "rewards/margins": -1.042447566986084, "rewards/rejected": 4.350711822509766, "step": 3075 }, { "epoch": 0.68, "learning_rate": 7.679860325303032e-06, "logits/chosen": -1.4217735528945923, "logits/rejected": -1.3442963361740112, "logps/chosen": -55.21292495727539, "logps/rejected": -39.65443420410156, "loss": 1.3998, "rewards/accuracies": 1.0, "rewards/chosen": 4.575338363647461, "rewards/margins": 0.2858591079711914, "rewards/rejected": 4.2894792556762695, "step": 3076 }, { "epoch": 0.68, "learning_rate": 7.678347005778746e-06, "logits/chosen": -1.3461190462112427, "logits/rejected": -1.2621277570724487, "logps/chosen": -29.32077407836914, "logps/rejected": -57.87962341308594, "loss": 0.2564, "rewards/accuracies": 1.0, "rewards/chosen": 2.622344732284546, "rewards/margins": 2.269287586212158, "rewards/rejected": 0.3530571162700653, "step": 3077 }, { "epoch": 0.68, "learning_rate": 7.67683334209331e-06, "logits/chosen": -1.5054610967636108, "logits/rejected": -1.550642967224121, "logps/chosen": -62.76934814453125, "logps/rejected": -50.50810241699219, "loss": 1.1038, "rewards/accuracies": 0.0, "rewards/chosen": 1.69011390209198, "rewards/margins": -1.3753775358200073, "rewards/rejected": 3.0654914379119873, "step": 3078 }, { "epoch": 0.68, "learning_rate": 7.675319334441225e-06, "logits/chosen": -1.7270939350128174, "logits/rejected": -1.5862531661987305, "logps/chosen": -107.38778686523438, "logps/rejected": -58.189979553222656, "loss": 1.0991, "rewards/accuracies": 1.0, "rewards/chosen": 3.9557526111602783, "rewards/margins": 0.4199957847595215, "rewards/rejected": 3.535756826400757, "step": 3079 }, { "epoch": 0.68, "learning_rate": 7.673804983017036e-06, "logits/chosen": -1.6424751281738281, "logits/rejected": -1.549306035041809, "logps/chosen": -79.41690063476562, "logps/rejected": -44.116455078125, "loss": 0.2397, "rewards/accuracies": 1.0, "rewards/chosen": 4.550387859344482, "rewards/margins": 1.5559725761413574, "rewards/rejected": 2.994415283203125, "step": 3080 }, { "epoch": 0.68, "learning_rate": 7.672290288015334e-06, "logits/chosen": -1.5207892656326294, "logits/rejected": -1.4960308074951172, "logps/chosen": -54.779762268066406, "logps/rejected": -53.81315994262695, "loss": 1.0693, "rewards/accuracies": 0.0, "rewards/chosen": 2.817485809326172, "rewards/margins": -1.956225872039795, "rewards/rejected": 4.773711681365967, "step": 3081 }, { "epoch": 0.68, "learning_rate": 7.670775249630755e-06, "logits/chosen": -1.465453028678894, "logits/rejected": -1.3918612003326416, "logps/chosen": -99.56242370605469, "logps/rejected": -92.73457336425781, "loss": 0.6221, "rewards/accuracies": 0.0, "rewards/chosen": 7.354504585266113, "rewards/margins": -0.7614574432373047, "rewards/rejected": 8.115962028503418, "step": 3082 }, { "epoch": 0.68, "learning_rate": 7.669259868057976e-06, "logits/chosen": -1.5983532667160034, "logits/rejected": -1.5849902629852295, "logps/chosen": -34.67653274536133, "logps/rejected": -17.525178909301758, "loss": 0.8103, "rewards/accuracies": 0.0, "rewards/chosen": 2.1248161792755127, "rewards/margins": -0.42412328720092773, "rewards/rejected": 2.5489394664764404, "step": 3083 }, { "epoch": 0.68, "learning_rate": 7.66774414349172e-06, "logits/chosen": -1.441169261932373, "logits/rejected": -1.441169261932373, "logps/chosen": -44.35295867919922, "logps/rejected": -44.35295867919922, "loss": 0.5821, "rewards/accuracies": 0.0, "rewards/chosen": 6.406313419342041, "rewards/margins": 0.0, "rewards/rejected": 6.406313419342041, "step": 3084 }, { "epoch": 0.68, "learning_rate": 7.666228076126755e-06, "logits/chosen": -1.4373345375061035, "logits/rejected": -1.4231737852096558, "logps/chosen": -70.27204895019531, "logps/rejected": -90.32020568847656, "loss": 0.3071, "rewards/accuracies": 1.0, "rewards/chosen": 6.52957010269165, "rewards/margins": 2.847933292388916, "rewards/rejected": 3.6816368103027344, "step": 3085 }, { "epoch": 0.68, "learning_rate": 7.66471166615789e-06, "logits/chosen": -1.1159677505493164, "logits/rejected": -1.0351427793502808, "logps/chosen": -73.86940002441406, "logps/rejected": -48.3973503112793, "loss": 0.1199, "rewards/accuracies": 1.0, "rewards/chosen": 3.3047072887420654, "rewards/margins": 1.3337352275848389, "rewards/rejected": 1.9709720611572266, "step": 3086 }, { "epoch": 0.68, "learning_rate": 7.663194913779985e-06, "logits/chosen": -1.495025396347046, "logits/rejected": -1.5422558784484863, "logps/chosen": -40.32098388671875, "logps/rejected": -79.07283020019531, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": 4.98415994644165, "rewards/margins": 2.325942277908325, "rewards/rejected": 2.658217668533325, "step": 3087 }, { "epoch": 0.68, "learning_rate": 7.661677819187935e-06, "logits/chosen": -1.4914348125457764, "logits/rejected": -1.442393183708191, "logps/chosen": -32.02886199951172, "logps/rejected": -14.483545303344727, "loss": 0.6486, "rewards/accuracies": 1.0, "rewards/chosen": 2.6355819702148438, "rewards/margins": 0.17158865928649902, "rewards/rejected": 2.4639933109283447, "step": 3088 }, { "epoch": 0.68, "learning_rate": 7.660160382576683e-06, "logits/chosen": -1.346599817276001, "logits/rejected": -1.4134238958358765, "logps/chosen": -66.46685028076172, "logps/rejected": -83.34609985351562, "loss": 3.4753, "rewards/accuracies": 0.0, "rewards/chosen": 3.093827962875366, "rewards/margins": -4.649843215942383, "rewards/rejected": 7.74367094039917, "step": 3089 }, { "epoch": 0.68, "learning_rate": 7.658642604141218e-06, "logits/chosen": -1.7888319492340088, "logits/rejected": -1.6579045057296753, "logps/chosen": -110.67018127441406, "logps/rejected": -39.08161926269531, "loss": 0.2005, "rewards/accuracies": 1.0, "rewards/chosen": 7.42739725112915, "rewards/margins": 4.639866828918457, "rewards/rejected": 2.7875306606292725, "step": 3090 }, { "epoch": 0.68, "learning_rate": 7.657124484076569e-06, "logits/chosen": -1.7770986557006836, "logits/rejected": -1.7071657180786133, "logps/chosen": -124.84159088134766, "logps/rejected": -47.92755126953125, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 6.752016544342041, "rewards/margins": 3.227569580078125, "rewards/rejected": 3.524446964263916, "step": 3091 }, { "epoch": 0.68, "learning_rate": 7.65560602257781e-06, "logits/chosen": -1.2757850885391235, "logits/rejected": -1.3044726848602295, "logps/chosen": -61.45519256591797, "logps/rejected": -55.71124267578125, "loss": 1.0315, "rewards/accuracies": 0.0, "rewards/chosen": 2.2911179065704346, "rewards/margins": -1.9215471744537354, "rewards/rejected": 4.21266508102417, "step": 3092 }, { "epoch": 0.68, "learning_rate": 7.65408721984006e-06, "logits/chosen": -1.6675117015838623, "logits/rejected": -1.6159160137176514, "logps/chosen": -62.45366668701172, "logps/rejected": -12.560528755187988, "loss": 1.7751, "rewards/accuracies": 1.0, "rewards/chosen": 4.236240386962891, "rewards/margins": 3.4947478771209717, "rewards/rejected": 0.7414925694465637, "step": 3093 }, { "epoch": 0.68, "learning_rate": 7.652568076058486e-06, "logits/chosen": -1.7574583292007446, "logits/rejected": -1.6846343278884888, "logps/chosen": -158.84194946289062, "logps/rejected": -83.08969116210938, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 6.082228183746338, "rewards/margins": 3.138747453689575, "rewards/rejected": 2.9434807300567627, "step": 3094 }, { "epoch": 0.69, "learning_rate": 7.651048591428289e-06, "logits/chosen": -1.566901683807373, "logits/rejected": -1.5483136177062988, "logps/chosen": -79.69660186767578, "logps/rejected": -41.97611999511719, "loss": 0.5298, "rewards/accuracies": 0.0, "rewards/chosen": 2.084941864013672, "rewards/margins": -0.5922532081604004, "rewards/rejected": 2.6771950721740723, "step": 3095 }, { "epoch": 0.69, "learning_rate": 7.649528766144721e-06, "logits/chosen": -1.6792670488357544, "logits/rejected": -1.6602602005004883, "logps/chosen": -42.23284912109375, "logps/rejected": -49.5244140625, "loss": 0.4357, "rewards/accuracies": 0.0, "rewards/chosen": 3.169600009918213, "rewards/margins": -0.07885122299194336, "rewards/rejected": 3.2484512329101562, "step": 3096 }, { "epoch": 0.69, "learning_rate": 7.648008600403076e-06, "logits/chosen": -1.5163472890853882, "logits/rejected": -1.5490078926086426, "logps/chosen": -20.345043182373047, "logps/rejected": -37.63177490234375, "loss": 0.8424, "rewards/accuracies": 0.0, "rewards/chosen": 1.6402275562286377, "rewards/margins": -1.4039435386657715, "rewards/rejected": 3.044171094894409, "step": 3097 }, { "epoch": 0.69, "learning_rate": 7.64648809439869e-06, "logits/chosen": -1.5875787734985352, "logits/rejected": -1.6596847772598267, "logps/chosen": -86.1681900024414, "logps/rejected": -73.56189727783203, "loss": 1.0954, "rewards/accuracies": 0.0, "rewards/chosen": 2.897315263748169, "rewards/margins": -1.8742668628692627, "rewards/rejected": 4.771582126617432, "step": 3098 }, { "epoch": 0.69, "learning_rate": 7.644967248326948e-06, "logits/chosen": -1.670573115348816, "logits/rejected": -1.549620270729065, "logps/chosen": -290.64239501953125, "logps/rejected": -81.4153060913086, "loss": 2.4612, "rewards/accuracies": 0.0, "rewards/chosen": -4.23415994644165, "rewards/margins": -4.578105926513672, "rewards/rejected": 0.34394606947898865, "step": 3099 }, { "epoch": 0.69, "learning_rate": 7.643446062383273e-06, "logits/chosen": -1.3728479146957397, "logits/rejected": -1.3728479146957397, "logps/chosen": -84.57713317871094, "logps/rejected": -84.57713317871094, "loss": 0.4042, "rewards/accuracies": 0.0, "rewards/chosen": 2.6266915798187256, "rewards/margins": 0.0, "rewards/rejected": 2.6266915798187256, "step": 3100 }, { "epoch": 0.69, "learning_rate": 7.641924536763131e-06, "logits/chosen": -1.5604182481765747, "logits/rejected": -1.5558321475982666, "logps/chosen": -96.32951354980469, "logps/rejected": -95.1762466430664, "loss": 1.2143, "rewards/accuracies": 0.0, "rewards/chosen": 4.820404052734375, "rewards/margins": -1.8339042663574219, "rewards/rejected": 6.654308319091797, "step": 3101 }, { "epoch": 0.69, "learning_rate": 7.640402671662039e-06, "logits/chosen": -1.3941869735717773, "logits/rejected": -1.3584357500076294, "logps/chosen": -92.2717514038086, "logps/rejected": -64.71735382080078, "loss": 0.3256, "rewards/accuracies": 1.0, "rewards/chosen": 2.614919424057007, "rewards/margins": 0.10419774055480957, "rewards/rejected": 2.5107216835021973, "step": 3102 }, { "epoch": 0.69, "learning_rate": 7.638880467275552e-06, "logits/chosen": -1.7531791925430298, "logits/rejected": -1.7633624076843262, "logps/chosen": -30.95254135131836, "logps/rejected": -151.6119384765625, "loss": 2.8347, "rewards/accuracies": 0.0, "rewards/chosen": 2.4304885864257812, "rewards/margins": -5.536229133605957, "rewards/rejected": 7.966717720031738, "step": 3103 }, { "epoch": 0.69, "learning_rate": 7.637357923799267e-06, "logits/chosen": -1.509100317955017, "logits/rejected": -1.5222201347351074, "logps/chosen": -126.99581146240234, "logps/rejected": -71.24559020996094, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 5.606353282928467, "rewards/margins": 0.058884620666503906, "rewards/rejected": 5.547468662261963, "step": 3104 }, { "epoch": 0.69, "learning_rate": 7.63583504142883e-06, "logits/chosen": -1.4188669919967651, "logits/rejected": -1.397499680519104, "logps/chosen": -51.729129791259766, "logps/rejected": -49.28032302856445, "loss": 3.1688, "rewards/accuracies": 0.0, "rewards/chosen": 2.045689821243286, "rewards/margins": -0.334200382232666, "rewards/rejected": 2.379890203475952, "step": 3105 }, { "epoch": 0.69, "learning_rate": 7.634311820359925e-06, "logits/chosen": -1.3573484420776367, "logits/rejected": -1.4827145338058472, "logps/chosen": -67.57809448242188, "logps/rejected": -147.82737731933594, "loss": 4.2281, "rewards/accuracies": 0.0, "rewards/chosen": 3.441600799560547, "rewards/margins": -6.552437782287598, "rewards/rejected": 9.994038581848145, "step": 3106 }, { "epoch": 0.69, "learning_rate": 7.632788260788285e-06, "logits/chosen": -1.623250961303711, "logits/rejected": -1.4941492080688477, "logps/chosen": -55.485740661621094, "logps/rejected": -30.465200424194336, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 2.9356117248535156, "rewards/margins": 1.3996480703353882, "rewards/rejected": 1.5359636545181274, "step": 3107 }, { "epoch": 0.69, "learning_rate": 7.631264362909683e-06, "logits/chosen": -1.4600756168365479, "logits/rejected": -1.4795243740081787, "logps/chosen": -47.54629898071289, "logps/rejected": -74.98206329345703, "loss": 1.2523, "rewards/accuracies": 1.0, "rewards/chosen": 4.670068264007568, "rewards/margins": 0.8359122276306152, "rewards/rejected": 3.834156036376953, "step": 3108 }, { "epoch": 0.69, "learning_rate": 7.629740126919934e-06, "logits/chosen": -1.5266623497009277, "logits/rejected": -1.5608793497085571, "logps/chosen": -135.0013885498047, "logps/rejected": -122.6473159790039, "loss": 3.13, "rewards/accuracies": 0.0, "rewards/chosen": 6.386409282684326, "rewards/margins": -2.0141472816467285, "rewards/rejected": 8.400556564331055, "step": 3109 }, { "epoch": 0.69, "learning_rate": 7.628215553014902e-06, "logits/chosen": -1.254076600074768, "logits/rejected": -1.286954641342163, "logps/chosen": -81.50555419921875, "logps/rejected": -50.544307708740234, "loss": 0.3726, "rewards/accuracies": 1.0, "rewards/chosen": 5.633404731750488, "rewards/margins": 3.224456548690796, "rewards/rejected": 2.4089481830596924, "step": 3110 }, { "epoch": 0.69, "learning_rate": 7.6266906413904885e-06, "logits/chosen": -1.3685100078582764, "logits/rejected": -1.3228527307510376, "logps/chosen": -28.331321716308594, "logps/rejected": -17.41886329650879, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 4.082564830780029, "rewards/margins": 2.915505886077881, "rewards/rejected": 1.1670589447021484, "step": 3111 }, { "epoch": 0.69, "learning_rate": 7.625165392242642e-06, "logits/chosen": -1.5178687572479248, "logits/rejected": -1.538191556930542, "logps/chosen": -37.77143859863281, "logps/rejected": -29.31234359741211, "loss": 0.2635, "rewards/accuracies": 1.0, "rewards/chosen": 2.6142470836639404, "rewards/margins": 0.6368861198425293, "rewards/rejected": 1.9773609638214111, "step": 3112 }, { "epoch": 0.69, "learning_rate": 7.623639805767353e-06, "logits/chosen": -1.5156519412994385, "logits/rejected": -1.5182991027832031, "logps/chosen": -74.08781433105469, "logps/rejected": -101.56393432617188, "loss": 0.5998, "rewards/accuracies": 0.0, "rewards/chosen": 2.5980820655822754, "rewards/margins": -0.6447570323944092, "rewards/rejected": 3.2428390979766846, "step": 3113 }, { "epoch": 0.69, "learning_rate": 7.622113882160658e-06, "logits/chosen": -1.1698461771011353, "logits/rejected": -1.1698461771011353, "logps/chosen": -4.962527275085449, "logps/rejected": -4.962527275085449, "loss": 0.3516, "rewards/accuracies": 0.0, "rewards/chosen": 0.7654838562011719, "rewards/margins": 0.0, "rewards/rejected": 0.7654838562011719, "step": 3114 }, { "epoch": 0.69, "learning_rate": 7.620587621618632e-06, "logits/chosen": -1.166263222694397, "logits/rejected": -1.166263222694397, "logps/chosen": -27.517154693603516, "logps/rejected": -27.517154693603516, "loss": 1.4548, "rewards/accuracies": 0.0, "rewards/chosen": 2.7179081439971924, "rewards/margins": 0.0, "rewards/rejected": 2.7179081439971924, "step": 3115 }, { "epoch": 0.69, "learning_rate": 7.619061024337394e-06, "logits/chosen": -1.706673502922058, "logits/rejected": -1.6346486806869507, "logps/chosen": -147.98231506347656, "logps/rejected": -129.79653930664062, "loss": 1.1716, "rewards/accuracies": 0.0, "rewards/chosen": 7.501500129699707, "rewards/margins": -2.2405223846435547, "rewards/rejected": 9.742022514343262, "step": 3116 }, { "epoch": 0.69, "learning_rate": 7.617534090513112e-06, "logits/chosen": -1.391978144645691, "logits/rejected": -1.3129711151123047, "logps/chosen": -119.52654266357422, "logps/rejected": -136.99246215820312, "loss": 0.8652, "rewards/accuracies": 0.0, "rewards/chosen": 9.029261589050293, "rewards/margins": -0.5995855331420898, "rewards/rejected": 9.628847122192383, "step": 3117 }, { "epoch": 0.69, "learning_rate": 7.6160068203419914e-06, "logits/chosen": -1.392343521118164, "logits/rejected": -1.4652776718139648, "logps/chosen": -135.60342407226562, "logps/rejected": -178.46401977539062, "loss": 0.9557, "rewards/accuracies": 1.0, "rewards/chosen": 9.503097534179688, "rewards/margins": 1.3315553665161133, "rewards/rejected": 8.171542167663574, "step": 3118 }, { "epoch": 0.69, "learning_rate": 7.614479214020283e-06, "logits/chosen": -1.142195463180542, "logits/rejected": -1.1612436771392822, "logps/chosen": -29.563644409179688, "logps/rejected": -45.43492126464844, "loss": 0.5675, "rewards/accuracies": 0.0, "rewards/chosen": 1.6620739698410034, "rewards/margins": -0.5587390661239624, "rewards/rejected": 2.220813035964966, "step": 3119 }, { "epoch": 0.69, "learning_rate": 7.612951271744281e-06, "logits/chosen": -1.5794203281402588, "logits/rejected": -1.5738667249679565, "logps/chosen": -47.17658233642578, "logps/rejected": -38.1114387512207, "loss": 0.5984, "rewards/accuracies": 0.0, "rewards/chosen": 1.9035667181015015, "rewards/margins": -0.48919379711151123, "rewards/rejected": 2.3927605152130127, "step": 3120 }, { "epoch": 0.69, "learning_rate": 7.611422993710322e-06, "logits/chosen": -1.1918323040008545, "logits/rejected": -1.2135508060455322, "logps/chosen": -76.44944763183594, "logps/rejected": -127.2117919921875, "loss": 1.9787, "rewards/accuracies": 0.0, "rewards/chosen": 2.7795350551605225, "rewards/margins": -3.153001546859741, "rewards/rejected": 5.932536602020264, "step": 3121 }, { "epoch": 0.69, "learning_rate": 7.609894380114786e-06, "logits/chosen": -1.751495599746704, "logits/rejected": -1.7230778932571411, "logps/chosen": -73.98983764648438, "logps/rejected": -42.73836135864258, "loss": 0.1427, "rewards/accuracies": 1.0, "rewards/chosen": 3.9544312953948975, "rewards/margins": 1.1111698150634766, "rewards/rejected": 2.843261480331421, "step": 3122 }, { "epoch": 0.69, "learning_rate": 7.608365431154097e-06, "logits/chosen": -1.3428974151611328, "logits/rejected": -1.3074562549591064, "logps/chosen": -109.17693328857422, "logps/rejected": -47.26466369628906, "loss": 2.0364, "rewards/accuracies": 0.0, "rewards/chosen": 5.009256839752197, "rewards/margins": -0.2631072998046875, "rewards/rejected": 5.272364139556885, "step": 3123 }, { "epoch": 0.69, "learning_rate": 7.6068361470247195e-06, "logits/chosen": -1.520349144935608, "logits/rejected": -1.5393623113632202, "logps/chosen": -103.88689422607422, "logps/rejected": -78.47100830078125, "loss": 1.5459, "rewards/accuracies": 0.0, "rewards/chosen": 3.7384071350097656, "rewards/margins": -3.03664493560791, "rewards/rejected": 6.775052070617676, "step": 3124 }, { "epoch": 0.69, "learning_rate": 7.605306527923164e-06, "logits/chosen": -1.6850297451019287, "logits/rejected": -1.683840036392212, "logps/chosen": -27.193944931030273, "logps/rejected": -37.534271240234375, "loss": 0.7845, "rewards/accuracies": 0.0, "rewards/chosen": 2.5947329998016357, "rewards/margins": -1.126230239868164, "rewards/rejected": 3.7209632396698, "step": 3125 }, { "epoch": 0.69, "learning_rate": 7.603776574045983e-06, "logits/chosen": -1.4750407934188843, "logits/rejected": -1.4750407934188843, "logps/chosen": -70.45259094238281, "logps/rejected": -70.45259094238281, "loss": 1.7626, "rewards/accuracies": 0.0, "rewards/chosen": 3.9638023376464844, "rewards/margins": 0.0, "rewards/rejected": 3.9638023376464844, "step": 3126 }, { "epoch": 0.69, "learning_rate": 7.6022462855897695e-06, "logits/chosen": -1.440436840057373, "logits/rejected": -1.2736763954162598, "logps/chosen": -107.21456909179688, "logps/rejected": -47.641876220703125, "loss": 1.1142, "rewards/accuracies": 1.0, "rewards/chosen": 5.975335597991943, "rewards/margins": 4.01936149597168, "rewards/rejected": 1.9559738636016846, "step": 3127 }, { "epoch": 0.69, "learning_rate": 7.600715662751166e-06, "logits/chosen": -1.4315158128738403, "logits/rejected": -1.0139033794403076, "logps/chosen": -148.932861328125, "logps/rejected": -81.93132781982422, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 7.735543727874756, "rewards/margins": 2.2761497497558594, "rewards/rejected": 5.4593939781188965, "step": 3128 }, { "epoch": 0.69, "learning_rate": 7.599184705726851e-06, "logits/chosen": -1.7165809869766235, "logits/rejected": -1.6196297407150269, "logps/chosen": -49.569915771484375, "logps/rejected": -25.326297760009766, "loss": 0.5369, "rewards/accuracies": 1.0, "rewards/chosen": 3.277853488922119, "rewards/margins": 2.012087821960449, "rewards/rejected": 1.2657657861709595, "step": 3129 }, { "epoch": 0.69, "learning_rate": 7.597653414713551e-06, "logits/chosen": -1.4016106128692627, "logits/rejected": -1.3356990814208984, "logps/chosen": -56.13142395019531, "logps/rejected": -75.16632843017578, "loss": 0.5001, "rewards/accuracies": 0.0, "rewards/chosen": 3.0994651317596436, "rewards/margins": -0.5407516956329346, "rewards/rejected": 3.640216827392578, "step": 3130 }, { "epoch": 0.69, "learning_rate": 7.59612178990803e-06, "logits/chosen": -1.6762481927871704, "logits/rejected": -1.4725115299224854, "logps/chosen": -110.5382080078125, "logps/rejected": -28.358261108398438, "loss": 1.0313, "rewards/accuracies": 1.0, "rewards/chosen": 6.906994819641113, "rewards/margins": 5.52116584777832, "rewards/rejected": 1.3858288526535034, "step": 3131 }, { "epoch": 0.69, "learning_rate": 7.594589831507101e-06, "logits/chosen": -1.3982499837875366, "logits/rejected": -1.35574209690094, "logps/chosen": -40.86223602294922, "logps/rejected": -72.8831787109375, "loss": 1.2816, "rewards/accuracies": 0.0, "rewards/chosen": 2.79182505607605, "rewards/margins": -2.309798002243042, "rewards/rejected": 5.101623058319092, "step": 3132 }, { "epoch": 0.69, "learning_rate": 7.593057539707616e-06, "logits/chosen": -1.3914352655410767, "logits/rejected": -1.2818315029144287, "logps/chosen": -96.99874877929688, "logps/rejected": -65.79914855957031, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 7.210170269012451, "rewards/margins": 3.7314493656158447, "rewards/rejected": 3.4787209033966064, "step": 3133 }, { "epoch": 0.69, "learning_rate": 7.59152491470647e-06, "logits/chosen": -1.5013774633407593, "logits/rejected": -1.5013774633407593, "logps/chosen": -29.175975799560547, "logps/rejected": -29.175975799560547, "loss": 0.3623, "rewards/accuracies": 0.0, "rewards/chosen": 2.004638671875, "rewards/margins": 0.0, "rewards/rejected": 2.004638671875, "step": 3134 }, { "epoch": 0.69, "learning_rate": 7.589991956700602e-06, "logits/chosen": -1.995808482170105, "logits/rejected": -1.961082935333252, "logps/chosen": -74.26197052001953, "logps/rejected": -56.43547058105469, "loss": 0.4333, "rewards/accuracies": 0.0, "rewards/chosen": 3.344472646713257, "rewards/margins": -0.3182394504547119, "rewards/rejected": 3.6627120971679688, "step": 3135 }, { "epoch": 0.69, "learning_rate": 7.588458665886993e-06, "logits/chosen": -1.777525544166565, "logits/rejected": -1.7496216297149658, "logps/chosen": -141.6357421875, "logps/rejected": -170.580322265625, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 7.1388092041015625, "rewards/margins": 2.2314820289611816, "rewards/rejected": 4.907327175140381, "step": 3136 }, { "epoch": 0.69, "learning_rate": 7.58692504246267e-06, "logits/chosen": -1.577009677886963, "logits/rejected": -1.4514013528823853, "logps/chosen": -64.3347396850586, "logps/rejected": -37.706573486328125, "loss": 0.3515, "rewards/accuracies": 1.0, "rewards/chosen": 1.157662272453308, "rewards/margins": 0.7954895496368408, "rewards/rejected": 0.3621726930141449, "step": 3137 }, { "epoch": 0.69, "learning_rate": 7.5853910866246964e-06, "logits/chosen": -1.8305267095565796, "logits/rejected": -1.8249201774597168, "logps/chosen": -82.00602722167969, "logps/rejected": -75.09319305419922, "loss": 1.2733, "rewards/accuracies": 0.0, "rewards/chosen": 2.987375020980835, "rewards/margins": -0.4041106700897217, "rewards/rejected": 3.3914856910705566, "step": 3138 }, { "epoch": 0.69, "learning_rate": 7.583856798570184e-06, "logits/chosen": -1.5771493911743164, "logits/rejected": -1.437548279762268, "logps/chosen": -152.365966796875, "logps/rejected": -48.41176986694336, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 6.873852729797363, "rewards/margins": 4.721893310546875, "rewards/rejected": 2.1519596576690674, "step": 3139 }, { "epoch": 0.69, "learning_rate": 7.582322178496284e-06, "logits/chosen": -1.6427199840545654, "logits/rejected": -1.577559232711792, "logps/chosen": -107.39247131347656, "logps/rejected": -38.822418212890625, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 6.074734687805176, "rewards/margins": 3.1717417240142822, "rewards/rejected": 2.9029929637908936, "step": 3140 }, { "epoch": 0.7, "learning_rate": 7.580787226600193e-06, "logits/chosen": -1.6919323205947876, "logits/rejected": -1.6783701181411743, "logps/chosen": -62.404685974121094, "logps/rejected": -59.7982177734375, "loss": 0.4576, "rewards/accuracies": 0.0, "rewards/chosen": 3.0905113220214844, "rewards/margins": -0.39995813369750977, "rewards/rejected": 3.490469455718994, "step": 3141 }, { "epoch": 0.7, "learning_rate": 7.579251943079145e-06, "logits/chosen": -1.288446307182312, "logits/rejected": -1.2963007688522339, "logps/chosen": -114.675048828125, "logps/rejected": -91.33143615722656, "loss": 0.1575, "rewards/accuracies": 1.0, "rewards/chosen": 7.846463203430176, "rewards/margins": 1.0652360916137695, "rewards/rejected": 6.781227111816406, "step": 3142 }, { "epoch": 0.7, "learning_rate": 7.577716328130425e-06, "logits/chosen": -1.1617485284805298, "logits/rejected": -1.1141762733459473, "logps/chosen": -68.45465087890625, "logps/rejected": -39.41622543334961, "loss": 0.9374, "rewards/accuracies": 0.0, "rewards/chosen": 1.7655800580978394, "rewards/margins": -1.2313541173934937, "rewards/rejected": 2.996934175491333, "step": 3143 }, { "epoch": 0.7, "learning_rate": 7.576180381951351e-06, "logits/chosen": -1.126622200012207, "logits/rejected": -1.0474830865859985, "logps/chosen": -59.93723678588867, "logps/rejected": -71.4622802734375, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 6.4204254150390625, "rewards/margins": 2.8631911277770996, "rewards/rejected": 3.557234287261963, "step": 3144 }, { "epoch": 0.7, "learning_rate": 7.574644104739293e-06, "logits/chosen": -1.344406008720398, "logits/rejected": -1.2267228364944458, "logps/chosen": -91.57308959960938, "logps/rejected": -50.547115325927734, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 5.45943021774292, "rewards/margins": 3.737579345703125, "rewards/rejected": 1.7218509912490845, "step": 3145 }, { "epoch": 0.7, "learning_rate": 7.573107496691656e-06, "logits/chosen": -1.754982590675354, "logits/rejected": -1.3205978870391846, "logps/chosen": -71.8188247680664, "logps/rejected": -90.7210922241211, "loss": 1.2197, "rewards/accuracies": 0.0, "rewards/chosen": 4.704783916473389, "rewards/margins": -2.0522098541259766, "rewards/rejected": 6.756993770599365, "step": 3146 }, { "epoch": 0.7, "learning_rate": 7.571570558005892e-06, "logits/chosen": -1.4512348175048828, "logits/rejected": -1.3285460472106934, "logps/chosen": -64.26519012451172, "logps/rejected": -48.46471405029297, "loss": 0.1517, "rewards/accuracies": 1.0, "rewards/chosen": 2.226278781890869, "rewards/margins": 1.184474229812622, "rewards/rejected": 1.041804552078247, "step": 3147 }, { "epoch": 0.7, "learning_rate": 7.570033288879493e-06, "logits/chosen": -1.6766009330749512, "logits/rejected": -1.3187693357467651, "logps/chosen": -56.52525329589844, "logps/rejected": -93.16209411621094, "loss": 0.6323, "rewards/accuracies": 1.0, "rewards/chosen": 3.4711456298828125, "rewards/margins": 0.8843796253204346, "rewards/rejected": 2.586766004562378, "step": 3148 }, { "epoch": 0.7, "learning_rate": 7.568495689509994e-06, "logits/chosen": -1.4250218868255615, "logits/rejected": -1.041090488433838, "logps/chosen": -60.6894645690918, "logps/rejected": -58.649444580078125, "loss": 1.6654, "rewards/accuracies": 0.0, "rewards/chosen": 2.074496030807495, "rewards/margins": -1.6859142780303955, "rewards/rejected": 3.7604103088378906, "step": 3149 }, { "epoch": 0.7, "learning_rate": 7.5669577600949725e-06, "logits/chosen": -1.8485769033432007, "logits/rejected": -1.8323805332183838, "logps/chosen": -87.34761047363281, "logps/rejected": -245.6292724609375, "loss": 2.0134, "rewards/accuracies": 0.0, "rewards/chosen": 5.4425859451293945, "rewards/margins": -4.003687858581543, "rewards/rejected": 9.446273803710938, "step": 3150 }, { "epoch": 0.7, "learning_rate": 7.565419500832052e-06, "logits/chosen": -1.6358835697174072, "logits/rejected": -1.5439120531082153, "logps/chosen": -70.54264831542969, "logps/rejected": -33.32450485229492, "loss": 0.3628, "rewards/accuracies": 1.0, "rewards/chosen": 3.2019264698028564, "rewards/margins": 0.4901275634765625, "rewards/rejected": 2.711798906326294, "step": 3151 }, { "epoch": 0.7, "learning_rate": 7.563880911918891e-06, "logits/chosen": -1.5144222974777222, "logits/rejected": -1.4941787719726562, "logps/chosen": -20.943939208984375, "logps/rejected": -13.303667068481445, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": 2.1861042976379395, "rewards/margins": 1.7281320095062256, "rewards/rejected": 0.45797234773635864, "step": 3152 }, { "epoch": 0.7, "learning_rate": 7.562341993553197e-06, "logits/chosen": -1.551095724105835, "logits/rejected": -1.3395025730133057, "logps/chosen": -120.17451477050781, "logps/rejected": -29.933391571044922, "loss": 1.1109, "rewards/accuracies": 1.0, "rewards/chosen": 5.381593227386475, "rewards/margins": 3.1862680912017822, "rewards/rejected": 2.1953251361846924, "step": 3153 }, { "epoch": 0.7, "learning_rate": 7.560802745932716e-06, "logits/chosen": -1.4409397840499878, "logits/rejected": -1.4428740739822388, "logps/chosen": -35.45085525512695, "logps/rejected": -10.976273536682129, "loss": 0.4958, "rewards/accuracies": 1.0, "rewards/chosen": 5.063244342803955, "rewards/margins": 3.6428723335266113, "rewards/rejected": 1.4203718900680542, "step": 3154 }, { "epoch": 0.7, "learning_rate": 7.559263169255238e-06, "logits/chosen": -1.9480631351470947, "logits/rejected": -1.9499657154083252, "logps/chosen": -60.86504364013672, "logps/rejected": -23.25539779663086, "loss": 1.5727, "rewards/accuracies": 1.0, "rewards/chosen": 7.768726348876953, "rewards/margins": 7.268472194671631, "rewards/rejected": 0.5002540946006775, "step": 3155 }, { "epoch": 0.7, "learning_rate": 7.557723263718596e-06, "logits/chosen": -1.6498222351074219, "logits/rejected": -1.6052883863449097, "logps/chosen": -73.68827056884766, "logps/rejected": -40.21775436401367, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 5.1801018714904785, "rewards/margins": 1.9569385051727295, "rewards/rejected": 3.223163366317749, "step": 3156 }, { "epoch": 0.7, "learning_rate": 7.556183029520663e-06, "logits/chosen": -1.3885220289230347, "logits/rejected": -1.3842613697052002, "logps/chosen": -44.502342224121094, "logps/rejected": -55.04248046875, "loss": 1.2434, "rewards/accuracies": 1.0, "rewards/chosen": 2.5178871154785156, "rewards/margins": 0.13854598999023438, "rewards/rejected": 2.3793411254882812, "step": 3157 }, { "epoch": 0.7, "learning_rate": 7.554642466859354e-06, "logits/chosen": -1.7208698987960815, "logits/rejected": -1.6744521856307983, "logps/chosen": -130.0763702392578, "logps/rejected": -109.85187530517578, "loss": 0.167, "rewards/accuracies": 1.0, "rewards/chosen": 5.780869960784912, "rewards/margins": 0.938072681427002, "rewards/rejected": 4.84279727935791, "step": 3158 }, { "epoch": 0.7, "learning_rate": 7.553101575932631e-06, "logits/chosen": -1.5452457666397095, "logits/rejected": -1.4920681715011597, "logps/chosen": -56.256561279296875, "logps/rejected": -55.518428802490234, "loss": 1.4797, "rewards/accuracies": 0.0, "rewards/chosen": 3.169293165206909, "rewards/margins": -0.5240726470947266, "rewards/rejected": 3.6933658123016357, "step": 3159 }, { "epoch": 0.7, "learning_rate": 7.551560356938492e-06, "logits/chosen": -1.7208845615386963, "logits/rejected": -1.6976643800735474, "logps/chosen": -111.55459594726562, "logps/rejected": -115.64273071289062, "loss": 1.0451, "rewards/accuracies": 0.0, "rewards/chosen": 7.203198432922363, "rewards/margins": -1.4836063385009766, "rewards/rejected": 8.68680477142334, "step": 3160 }, { "epoch": 0.7, "learning_rate": 7.550018810074982e-06, "logits/chosen": -1.6100642681121826, "logits/rejected": -1.5746530294418335, "logps/chosen": -32.460716247558594, "logps/rejected": -29.500835418701172, "loss": 0.6439, "rewards/accuracies": 0.0, "rewards/chosen": 1.314618706703186, "rewards/margins": -0.011123299598693848, "rewards/rejected": 1.3257420063018799, "step": 3161 }, { "epoch": 0.7, "learning_rate": 7.548476935540183e-06, "logits/chosen": -1.4354939460754395, "logits/rejected": -1.306261658668518, "logps/chosen": -102.53865051269531, "logps/rejected": -42.12892150878906, "loss": 0.0954, "rewards/accuracies": 1.0, "rewards/chosen": 5.0628509521484375, "rewards/margins": 2.2047102451324463, "rewards/rejected": 2.858140707015991, "step": 3162 }, { "epoch": 0.7, "learning_rate": 7.546934733532227e-06, "logits/chosen": -1.6929880380630493, "logits/rejected": -1.6786110401153564, "logps/chosen": -43.637451171875, "logps/rejected": -55.01128387451172, "loss": 0.4501, "rewards/accuracies": 0.0, "rewards/chosen": 2.5028083324432373, "rewards/margins": -0.12610173225402832, "rewards/rejected": 2.6289100646972656, "step": 3163 }, { "epoch": 0.7, "learning_rate": 7.5453922042492774e-06, "logits/chosen": -1.712817907333374, "logits/rejected": -1.7137569189071655, "logps/chosen": -38.935035705566406, "logps/rejected": -45.97199249267578, "loss": 1.9, "rewards/accuracies": 0.0, "rewards/chosen": 2.3326549530029297, "rewards/margins": -1.9803271293640137, "rewards/rejected": 4.312982082366943, "step": 3164 }, { "epoch": 0.7, "learning_rate": 7.5438493478895515e-06, "logits/chosen": -1.6607438325881958, "logits/rejected": -1.6228854656219482, "logps/chosen": -76.42353820800781, "logps/rejected": -133.8636474609375, "loss": 0.2692, "rewards/accuracies": 1.0, "rewards/chosen": 5.559788703918457, "rewards/margins": 0.41936492919921875, "rewards/rejected": 5.140423774719238, "step": 3165 }, { "epoch": 0.7, "learning_rate": 7.542306164651298e-06, "logits/chosen": -1.375292181968689, "logits/rejected": -1.1743102073669434, "logps/chosen": -69.84986877441406, "logps/rejected": -33.232906341552734, "loss": 0.9555, "rewards/accuracies": 1.0, "rewards/chosen": 2.8509292602539062, "rewards/margins": 2.1283679008483887, "rewards/rejected": 0.7225612998008728, "step": 3166 }, { "epoch": 0.7, "learning_rate": 7.540762654732814e-06, "logits/chosen": -1.2342926263809204, "logits/rejected": -1.208000898361206, "logps/chosen": -128.33026123046875, "logps/rejected": -90.35672760009766, "loss": 0.8618, "rewards/accuracies": 0.0, "rewards/chosen": 7.171395778656006, "rewards/margins": -0.9353299140930176, "rewards/rejected": 8.106725692749023, "step": 3167 }, { "epoch": 0.7, "learning_rate": 7.539218818332437e-06, "logits/chosen": -1.6832956075668335, "logits/rejected": -1.619419813156128, "logps/chosen": -40.71333312988281, "logps/rejected": -17.017364501953125, "loss": 0.1225, "rewards/accuracies": 1.0, "rewards/chosen": 3.097273349761963, "rewards/margins": 1.3796753883361816, "rewards/rejected": 1.7175979614257812, "step": 3168 }, { "epoch": 0.7, "learning_rate": 7.537674655648543e-06, "logits/chosen": -1.4732296466827393, "logits/rejected": -1.4859511852264404, "logps/chosen": -28.396615982055664, "logps/rejected": -37.954490661621094, "loss": 1.0207, "rewards/accuracies": 0.0, "rewards/chosen": 1.4878290891647339, "rewards/margins": -1.7676252126693726, "rewards/rejected": 3.2554543018341064, "step": 3169 }, { "epoch": 0.7, "learning_rate": 7.536130166879561e-06, "logits/chosen": -1.5683979988098145, "logits/rejected": -1.4967256784439087, "logps/chosen": -55.54402160644531, "logps/rejected": -79.96556091308594, "loss": 0.734, "rewards/accuracies": 0.0, "rewards/chosen": 1.632637858390808, "rewards/margins": -0.44465339183807373, "rewards/rejected": 2.077291250228882, "step": 3170 }, { "epoch": 0.7, "learning_rate": 7.534585352223946e-06, "logits/chosen": -1.5333993434906006, "logits/rejected": -1.4129090309143066, "logps/chosen": -67.62869262695312, "logps/rejected": -78.79692077636719, "loss": 1.7352, "rewards/accuracies": 0.0, "rewards/chosen": 3.7579567432403564, "rewards/margins": -0.19675755500793457, "rewards/rejected": 3.954714298248291, "step": 3171 }, { "epoch": 0.7, "learning_rate": 7.533040211880207e-06, "logits/chosen": -1.6615214347839355, "logits/rejected": -1.3503994941711426, "logps/chosen": -46.55073928833008, "logps/rejected": -38.66421127319336, "loss": 0.4535, "rewards/accuracies": 1.0, "rewards/chosen": 3.4293179512023926, "rewards/margins": 1.4335293769836426, "rewards/rejected": 1.99578857421875, "step": 3172 }, { "epoch": 0.7, "learning_rate": 7.531494746046893e-06, "logits/chosen": -1.5807479619979858, "logits/rejected": -1.5679397583007812, "logps/chosen": -48.67262268066406, "logps/rejected": -48.438880920410156, "loss": 0.6511, "rewards/accuracies": 0.0, "rewards/chosen": 2.6614181995391846, "rewards/margins": -0.9531264305114746, "rewards/rejected": 3.614544630050659, "step": 3173 }, { "epoch": 0.7, "learning_rate": 7.529948954922586e-06, "logits/chosen": -1.1484861373901367, "logits/rejected": -1.1383501291275024, "logps/chosen": -11.282604217529297, "logps/rejected": -6.776405334472656, "loss": 0.4179, "rewards/accuracies": 0.0, "rewards/chosen": 1.0217630863189697, "rewards/margins": -0.023865103721618652, "rewards/rejected": 1.0456281900405884, "step": 3174 }, { "epoch": 0.7, "learning_rate": 7.528402838705921e-06, "logits/chosen": -1.649390459060669, "logits/rejected": -1.6127855777740479, "logps/chosen": -110.45099639892578, "logps/rejected": -58.08513641357422, "loss": 0.9647, "rewards/accuracies": 1.0, "rewards/chosen": 7.169692516326904, "rewards/margins": 2.344386577606201, "rewards/rejected": 4.825305938720703, "step": 3175 }, { "epoch": 0.7, "learning_rate": 7.526856397595569e-06, "logits/chosen": -1.7359097003936768, "logits/rejected": -0.8763019442558289, "logps/chosen": -115.42195892333984, "logps/rejected": -79.78987121582031, "loss": 0.19, "rewards/accuracies": 1.0, "rewards/chosen": 8.252457618713379, "rewards/margins": 2.695528507232666, "rewards/rejected": 5.556929111480713, "step": 3176 }, { "epoch": 0.7, "learning_rate": 7.525309631790244e-06, "logits/chosen": -1.4733353853225708, "logits/rejected": -1.3651121854782104, "logps/chosen": -49.23171615600586, "logps/rejected": -16.154489517211914, "loss": 2.7116, "rewards/accuracies": 1.0, "rewards/chosen": 2.6665151119232178, "rewards/margins": 2.261143922805786, "rewards/rejected": 0.4053712785243988, "step": 3177 }, { "epoch": 0.7, "learning_rate": 7.523762541488703e-06, "logits/chosen": -1.4822802543640137, "logits/rejected": -1.4822802543640137, "logps/chosen": -40.68986892700195, "logps/rejected": -40.68986892700195, "loss": 0.6816, "rewards/accuracies": 0.0, "rewards/chosen": 5.178194046020508, "rewards/margins": 0.0, "rewards/rejected": 5.178194046020508, "step": 3178 }, { "epoch": 0.7, "learning_rate": 7.522215126889742e-06, "logits/chosen": -1.814388394355774, "logits/rejected": -1.7948582172393799, "logps/chosen": -78.736572265625, "logps/rejected": -79.31072998046875, "loss": 0.3634, "rewards/accuracies": 1.0, "rewards/chosen": 5.187835693359375, "rewards/margins": 1.5064575672149658, "rewards/rejected": 3.681378126144409, "step": 3179 }, { "epoch": 0.7, "learning_rate": 7.5206673881922e-06, "logits/chosen": -1.5562043190002441, "logits/rejected": -1.4960931539535522, "logps/chosen": -124.85691833496094, "logps/rejected": -14.923126220703125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 4.59956693649292, "rewards/margins": 4.231657028198242, "rewards/rejected": 0.3679099977016449, "step": 3180 }, { "epoch": 0.7, "learning_rate": 7.519119325594959e-06, "logits/chosen": -1.5831618309020996, "logits/rejected": -1.6847158670425415, "logps/chosen": -98.52404022216797, "logps/rejected": -117.23178100585938, "loss": 1.1717, "rewards/accuracies": 0.0, "rewards/chosen": 3.6238136291503906, "rewards/margins": -1.436988353729248, "rewards/rejected": 5.060801982879639, "step": 3181 }, { "epoch": 0.7, "learning_rate": 7.517570939296941e-06, "logits/chosen": -1.6334065198898315, "logits/rejected": -1.6434050798416138, "logps/chosen": -54.91435241699219, "logps/rejected": -75.01192474365234, "loss": 3.9638, "rewards/accuracies": 0.0, "rewards/chosen": 2.1532654762268066, "rewards/margins": -4.411545753479004, "rewards/rejected": 6.5648112297058105, "step": 3182 }, { "epoch": 0.7, "learning_rate": 7.516022229497109e-06, "logits/chosen": -1.6746845245361328, "logits/rejected": -1.4871646165847778, "logps/chosen": -71.10177612304688, "logps/rejected": -27.19285011291504, "loss": 0.9976, "rewards/accuracies": 1.0, "rewards/chosen": 4.045226573944092, "rewards/margins": 3.2120816707611084, "rewards/rejected": 0.8331449627876282, "step": 3183 }, { "epoch": 0.7, "learning_rate": 7.514473196394467e-06, "logits/chosen": -0.9836075901985168, "logits/rejected": -0.9836075901985168, "logps/chosen": -10.265792846679688, "logps/rejected": -10.265792846679688, "loss": 0.3711, "rewards/accuracies": 0.0, "rewards/chosen": 0.5153562426567078, "rewards/margins": 0.0, "rewards/rejected": 0.5153562426567078, "step": 3184 }, { "epoch": 0.7, "learning_rate": 7.512923840188066e-06, "logits/chosen": -1.488250494003296, "logits/rejected": -1.5974055528640747, "logps/chosen": -94.14236450195312, "logps/rejected": -98.06463623046875, "loss": 2.2104, "rewards/accuracies": 0.0, "rewards/chosen": 5.577641487121582, "rewards/margins": -1.8112244606018066, "rewards/rejected": 7.388865947723389, "step": 3185 }, { "epoch": 0.71, "learning_rate": 7.511374161076991e-06, "logits/chosen": -1.516619086265564, "logits/rejected": -1.516619086265564, "logps/chosen": -26.204782485961914, "logps/rejected": -26.204782485961914, "loss": 0.3509, "rewards/accuracies": 0.0, "rewards/chosen": 2.939965009689331, "rewards/margins": 0.0, "rewards/rejected": 2.939965009689331, "step": 3186 }, { "epoch": 0.71, "learning_rate": 7.509824159260373e-06, "logits/chosen": -1.677574872970581, "logits/rejected": -1.641235589981079, "logps/chosen": -136.02037048339844, "logps/rejected": -134.80325317382812, "loss": 1.0176, "rewards/accuracies": 0.0, "rewards/chosen": 5.630122661590576, "rewards/margins": -0.5372848510742188, "rewards/rejected": 6.167407512664795, "step": 3187 }, { "epoch": 0.71, "learning_rate": 7.5082738349373854e-06, "logits/chosen": -1.6767748594284058, "logits/rejected": -1.6084065437316895, "logps/chosen": -51.24071502685547, "logps/rejected": -65.64196014404297, "loss": 2.0354, "rewards/accuracies": 0.0, "rewards/chosen": 2.0815277099609375, "rewards/margins": -0.5771529674530029, "rewards/rejected": 2.6586806774139404, "step": 3188 }, { "epoch": 0.71, "learning_rate": 7.506723188307241e-06, "logits/chosen": -1.4250560998916626, "logits/rejected": -1.3979105949401855, "logps/chosen": -77.22442626953125, "logps/rejected": -63.92438507080078, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": 5.935589790344238, "rewards/margins": 4.823990821838379, "rewards/rejected": 1.1115989685058594, "step": 3189 }, { "epoch": 0.71, "learning_rate": 7.50517221956919e-06, "logits/chosen": -1.5812644958496094, "logits/rejected": -1.5704859495162964, "logps/chosen": -59.44828414916992, "logps/rejected": -76.89115905761719, "loss": 0.8929, "rewards/accuracies": 0.0, "rewards/chosen": 1.8503360748291016, "rewards/margins": -1.5954480171203613, "rewards/rejected": 3.445784091949463, "step": 3190 }, { "epoch": 0.71, "learning_rate": 7.5036209289225325e-06, "logits/chosen": -1.5803767442703247, "logits/rejected": -1.5532630681991577, "logps/chosen": -88.52743530273438, "logps/rejected": -73.80204772949219, "loss": 1.559, "rewards/accuracies": 0.0, "rewards/chosen": 3.91876220703125, "rewards/margins": -1.0742554664611816, "rewards/rejected": 4.993017673492432, "step": 3191 }, { "epoch": 0.71, "learning_rate": 7.502069316566605e-06, "logits/chosen": -1.860244870185852, "logits/rejected": -1.860880732536316, "logps/chosen": -179.60525512695312, "logps/rejected": -175.71002197265625, "loss": 1.0371, "rewards/accuracies": 0.0, "rewards/chosen": 9.99988079071045, "rewards/margins": -0.7218112945556641, "rewards/rejected": 10.721692085266113, "step": 3192 }, { "epoch": 0.71, "learning_rate": 7.500517382700785e-06, "logits/chosen": -2.047121047973633, "logits/rejected": -1.9703238010406494, "logps/chosen": -132.21420288085938, "logps/rejected": -90.66703796386719, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 6.499336242675781, "rewards/margins": 3.4345481395721436, "rewards/rejected": 3.0647881031036377, "step": 3193 }, { "epoch": 0.71, "learning_rate": 7.498965127524491e-06, "logits/chosen": -1.5156116485595703, "logits/rejected": -1.4583004713058472, "logps/chosen": -156.96463012695312, "logps/rejected": -53.787841796875, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 6.809613227844238, "rewards/margins": 3.8744912147521973, "rewards/rejected": 2.935122013092041, "step": 3194 }, { "epoch": 0.71, "learning_rate": 7.497412551237185e-06, "logits/chosen": -1.4455538988113403, "logits/rejected": -1.4455538988113403, "logps/chosen": -53.94712829589844, "logps/rejected": -53.94712829589844, "loss": 1.3267, "rewards/accuracies": 0.0, "rewards/chosen": 3.7906570434570312, "rewards/margins": 0.0, "rewards/rejected": 3.7906570434570312, "step": 3195 }, { "epoch": 0.71, "learning_rate": 7.495859654038371e-06, "logits/chosen": -1.4820512533187866, "logits/rejected": -1.4820512533187866, "logps/chosen": -58.73798751831055, "logps/rejected": -58.73798751831055, "loss": 0.5085, "rewards/accuracies": 0.0, "rewards/chosen": 3.9019246101379395, "rewards/margins": 0.0, "rewards/rejected": 3.9019246101379395, "step": 3196 }, { "epoch": 0.71, "learning_rate": 7.4943064361275916e-06, "logits/chosen": -1.7469195127487183, "logits/rejected": -1.7115087509155273, "logps/chosen": -53.97629928588867, "logps/rejected": -66.91996765136719, "loss": 0.4219, "rewards/accuracies": 0.0, "rewards/chosen": 2.2854816913604736, "rewards/margins": -0.06423139572143555, "rewards/rejected": 2.349713087081909, "step": 3197 }, { "epoch": 0.71, "learning_rate": 7.492752897704432e-06, "logits/chosen": -1.6820143461227417, "logits/rejected": -1.6685413122177124, "logps/chosen": -99.37420654296875, "logps/rejected": -22.773380279541016, "loss": 0.8317, "rewards/accuracies": 1.0, "rewards/chosen": 4.981919765472412, "rewards/margins": 5.364912986755371, "rewards/rejected": -0.3829931318759918, "step": 3198 }, { "epoch": 0.71, "learning_rate": 7.491199038968515e-06, "logits/chosen": -1.6284891366958618, "logits/rejected": -1.5918909311294556, "logps/chosen": -105.40655517578125, "logps/rejected": -27.74767303466797, "loss": 0.716, "rewards/accuracies": 1.0, "rewards/chosen": 7.619726657867432, "rewards/margins": 1.8068976402282715, "rewards/rejected": 5.81282901763916, "step": 3199 }, { "epoch": 0.71, "learning_rate": 7.489644860119511e-06, "logits/chosen": -1.376964807510376, "logits/rejected": -1.399991750717163, "logps/chosen": -29.758522033691406, "logps/rejected": -35.11721420288086, "loss": 1.5789, "rewards/accuracies": 0.0, "rewards/chosen": 2.5282669067382812, "rewards/margins": -1.5202078819274902, "rewards/rejected": 4.0484747886657715, "step": 3200 }, { "epoch": 0.71, "learning_rate": 7.488090361357129e-06, "logits/chosen": -1.1352206468582153, "logits/rejected": -1.028441071510315, "logps/chosen": -52.67948532104492, "logps/rejected": -54.02079772949219, "loss": 0.0801, "rewards/accuracies": 1.0, "rewards/chosen": 4.09314489364624, "rewards/margins": 1.8183248043060303, "rewards/rejected": 2.27482008934021, "step": 3201 }, { "epoch": 0.71, "learning_rate": 7.486535542881116e-06, "logits/chosen": -1.5912549495697021, "logits/rejected": -1.6101146936416626, "logps/chosen": -71.88801574707031, "logps/rejected": -108.29411315917969, "loss": 2.9267, "rewards/accuracies": 0.0, "rewards/chosen": 5.114481449127197, "rewards/margins": -3.5910229682922363, "rewards/rejected": 8.705504417419434, "step": 3202 }, { "epoch": 0.71, "learning_rate": 7.4849804048912624e-06, "logits/chosen": -1.739423155784607, "logits/rejected": -1.6474699974060059, "logps/chosen": -89.04066467285156, "logps/rejected": -85.28488159179688, "loss": 1.1227, "rewards/accuracies": 1.0, "rewards/chosen": 8.280308723449707, "rewards/margins": 2.71795654296875, "rewards/rejected": 5.562352180480957, "step": 3203 }, { "epoch": 0.71, "learning_rate": 7.483424947587401e-06, "logits/chosen": -1.4935191869735718, "logits/rejected": -1.269563913345337, "logps/chosen": -98.13980865478516, "logps/rejected": -23.086362838745117, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 7.7166924476623535, "rewards/margins": 7.822679042816162, "rewards/rejected": -0.1059865951538086, "step": 3204 }, { "epoch": 0.71, "learning_rate": 7.481869171169404e-06, "logits/chosen": -1.6453725099563599, "logits/rejected": -1.5993061065673828, "logps/chosen": -41.246063232421875, "logps/rejected": -50.56077575683594, "loss": 2.5715, "rewards/accuracies": 1.0, "rewards/chosen": 2.8072609901428223, "rewards/margins": 1.3444260358810425, "rewards/rejected": 1.4628349542617798, "step": 3205 }, { "epoch": 0.71, "learning_rate": 7.480313075837185e-06, "logits/chosen": -1.7157548666000366, "logits/rejected": -1.7157548666000366, "logps/chosen": -47.31731414794922, "logps/rejected": -47.31731414794922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 7.485301494598389, "rewards/margins": 0.0, "rewards/rejected": 7.485301494598389, "step": 3206 }, { "epoch": 0.71, "learning_rate": 7.478756661790698e-06, "logits/chosen": -1.715505838394165, "logits/rejected": -1.7038054466247559, "logps/chosen": -58.84077835083008, "logps/rejected": -63.22072982788086, "loss": 0.64, "rewards/accuracies": 0.0, "rewards/chosen": 2.983053207397461, "rewards/margins": -0.6617462635040283, "rewards/rejected": 3.6447994709014893, "step": 3207 }, { "epoch": 0.71, "learning_rate": 7.477199929229938e-06, "logits/chosen": -1.6313533782958984, "logits/rejected": -1.7137362957000732, "logps/chosen": -148.35552978515625, "logps/rejected": -123.6385269165039, "loss": 1.5139, "rewards/accuracies": 0.0, "rewards/chosen": 4.756927490234375, "rewards/margins": -2.851595401763916, "rewards/rejected": 7.608522891998291, "step": 3208 }, { "epoch": 0.71, "learning_rate": 7.475642878354943e-06, "logits/chosen": -1.717529296875, "logits/rejected": -1.6589585542678833, "logps/chosen": -142.33444213867188, "logps/rejected": -71.97737121582031, "loss": 0.6943, "rewards/accuracies": 1.0, "rewards/chosen": 5.818078517913818, "rewards/margins": 1.0198698043823242, "rewards/rejected": 4.798208713531494, "step": 3209 }, { "epoch": 0.71, "learning_rate": 7.474085509365789e-06, "logits/chosen": -1.799553632736206, "logits/rejected": -1.8022568225860596, "logps/chosen": -56.4622917175293, "logps/rejected": -88.63111114501953, "loss": 0.6733, "rewards/accuracies": 0.0, "rewards/chosen": 4.100226402282715, "rewards/margins": -0.23716306686401367, "rewards/rejected": 4.3373894691467285, "step": 3210 }, { "epoch": 0.71, "learning_rate": 7.472527822462594e-06, "logits/chosen": -1.7048629522323608, "logits/rejected": -1.5977815389633179, "logps/chosen": -106.27462768554688, "logps/rejected": -78.05520629882812, "loss": 1.5646, "rewards/accuracies": 1.0, "rewards/chosen": 8.841318130493164, "rewards/margins": 1.1532444953918457, "rewards/rejected": 7.688073635101318, "step": 3211 }, { "epoch": 0.71, "learning_rate": 7.470969817845518e-06, "logits/chosen": -1.1224981546401978, "logits/rejected": -1.0141047239303589, "logps/chosen": -26.262557983398438, "logps/rejected": -3.918536901473999, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 2.044311285018921, "rewards/margins": 1.3322511911392212, "rewards/rejected": 0.7120600938796997, "step": 3212 }, { "epoch": 0.71, "learning_rate": 7.469411495714763e-06, "logits/chosen": -1.6249977350234985, "logits/rejected": -1.7491954565048218, "logps/chosen": -57.593894958496094, "logps/rejected": -109.06217193603516, "loss": 0.9965, "rewards/accuracies": 0.0, "rewards/chosen": 3.7597908973693848, "rewards/margins": -1.846449375152588, "rewards/rejected": 5.606240272521973, "step": 3213 }, { "epoch": 0.71, "learning_rate": 7.467852856270563e-06, "logits/chosen": -1.4509999752044678, "logits/rejected": -1.4509999752044678, "logps/chosen": -45.23786926269531, "logps/rejected": -45.23786926269531, "loss": 0.5469, "rewards/accuracies": 0.0, "rewards/chosen": 2.6506423950195312, "rewards/margins": 0.0, "rewards/rejected": 2.6506423950195312, "step": 3214 }, { "epoch": 0.71, "learning_rate": 7.466293899713206e-06, "logits/chosen": -1.4934792518615723, "logits/rejected": -1.4296101331710815, "logps/chosen": -60.60004425048828, "logps/rejected": -40.3392333984375, "loss": 0.8593, "rewards/accuracies": 1.0, "rewards/chosen": 2.6475303173065186, "rewards/margins": 0.02127218246459961, "rewards/rejected": 2.626258134841919, "step": 3215 }, { "epoch": 0.71, "learning_rate": 7.464734626243011e-06, "logits/chosen": -1.7680151462554932, "logits/rejected": -1.7112656831741333, "logps/chosen": -73.97689056396484, "logps/rejected": -69.45347595214844, "loss": 0.2424, "rewards/accuracies": 1.0, "rewards/chosen": 4.728946208953857, "rewards/margins": 0.9845247268676758, "rewards/rejected": 3.7444214820861816, "step": 3216 }, { "epoch": 0.71, "learning_rate": 7.463175036060341e-06, "logits/chosen": -1.5689976215362549, "logits/rejected": -1.4935762882232666, "logps/chosen": -88.91756439208984, "logps/rejected": -21.49553680419922, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 7.969101905822754, "rewards/margins": 6.6510772705078125, "rewards/rejected": 1.3180245161056519, "step": 3217 }, { "epoch": 0.71, "learning_rate": 7.4616151293656014e-06, "logits/chosen": -1.4539234638214111, "logits/rejected": -1.4501639604568481, "logps/chosen": -83.063232421875, "logps/rejected": -117.10545349121094, "loss": 0.3555, "rewards/accuracies": 1.0, "rewards/chosen": 6.885312080383301, "rewards/margins": 0.11843442916870117, "rewards/rejected": 6.7668776512146, "step": 3218 }, { "epoch": 0.71, "learning_rate": 7.460054906359234e-06, "logits/chosen": -1.4981672763824463, "logits/rejected": -1.4800063371658325, "logps/chosen": -56.882896423339844, "logps/rejected": -61.24677276611328, "loss": 0.5364, "rewards/accuracies": 0.0, "rewards/chosen": 3.1420419216156006, "rewards/margins": -0.13567352294921875, "rewards/rejected": 3.2777154445648193, "step": 3219 }, { "epoch": 0.71, "learning_rate": 7.4584943672417265e-06, "logits/chosen": -1.89590585231781, "logits/rejected": -1.8666388988494873, "logps/chosen": -119.70134735107422, "logps/rejected": -75.84563446044922, "loss": 0.4899, "rewards/accuracies": 1.0, "rewards/chosen": 8.419085502624512, "rewards/margins": 5.76047945022583, "rewards/rejected": 2.6586060523986816, "step": 3220 }, { "epoch": 0.71, "learning_rate": 7.456933512213601e-06, "logits/chosen": -1.6747658252716064, "logits/rejected": -1.6010769605636597, "logps/chosen": -143.59475708007812, "logps/rejected": -75.2047119140625, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 7.6386871337890625, "rewards/margins": 3.0146079063415527, "rewards/rejected": 4.62407922744751, "step": 3221 }, { "epoch": 0.71, "learning_rate": 7.455372341475428e-06, "logits/chosen": -1.684761881828308, "logits/rejected": -1.6078503131866455, "logps/chosen": -87.72637939453125, "logps/rejected": -81.69976806640625, "loss": 0.558, "rewards/accuracies": 1.0, "rewards/chosen": 5.601318359375, "rewards/margins": 1.877859354019165, "rewards/rejected": 3.723459005355835, "step": 3222 }, { "epoch": 0.71, "learning_rate": 7.45381085522781e-06, "logits/chosen": -1.6642298698425293, "logits/rejected": -1.6634211540222168, "logps/chosen": -64.18819427490234, "logps/rejected": -54.80326843261719, "loss": 0.6298, "rewards/accuracies": 0.0, "rewards/chosen": 2.0390663146972656, "rewards/margins": -0.9248147010803223, "rewards/rejected": 2.963881015777588, "step": 3223 }, { "epoch": 0.71, "learning_rate": 7.452249053671397e-06, "logits/chosen": -1.6525529623031616, "logits/rejected": -1.5995389223098755, "logps/chosen": -47.00048828125, "logps/rejected": -67.099365234375, "loss": 0.9762, "rewards/accuracies": 0.0, "rewards/chosen": 2.242175340652466, "rewards/margins": -1.7788245677947998, "rewards/rejected": 4.020999908447266, "step": 3224 }, { "epoch": 0.71, "learning_rate": 7.450686937006874e-06, "logits/chosen": -1.3662313222885132, "logits/rejected": -1.3093533515930176, "logps/chosen": -34.21630859375, "logps/rejected": -20.14834976196289, "loss": 0.1835, "rewards/accuracies": 1.0, "rewards/chosen": 3.435028076171875, "rewards/margins": 1.8809043169021606, "rewards/rejected": 1.5541237592697144, "step": 3225 }, { "epoch": 0.71, "learning_rate": 7.4491245054349716e-06, "logits/chosen": -1.3159044981002808, "logits/rejected": -1.3159044981002808, "logps/chosen": -89.40505981445312, "logps/rejected": -89.40505981445312, "loss": 0.4281, "rewards/accuracies": 0.0, "rewards/chosen": 1.70225989818573, "rewards/margins": 0.0, "rewards/rejected": 1.70225989818573, "step": 3226 }, { "epoch": 0.71, "learning_rate": 7.447561759156457e-06, "logits/chosen": -1.4976437091827393, "logits/rejected": -1.4827096462249756, "logps/chosen": -82.88314819335938, "logps/rejected": -59.186527252197266, "loss": 0.3597, "rewards/accuracies": 0.0, "rewards/chosen": 2.980821371078491, "rewards/margins": -0.04673266410827637, "rewards/rejected": 3.0275540351867676, "step": 3227 }, { "epoch": 0.71, "learning_rate": 7.445998698372141e-06, "logits/chosen": -1.6439889669418335, "logits/rejected": -1.677559494972229, "logps/chosen": -45.65043258666992, "logps/rejected": -91.88018798828125, "loss": 0.6142, "rewards/accuracies": 1.0, "rewards/chosen": 3.471287965774536, "rewards/margins": 0.04518246650695801, "rewards/rejected": 3.426105499267578, "step": 3228 }, { "epoch": 0.71, "learning_rate": 7.4444353232828715e-06, "logits/chosen": -1.487037181854248, "logits/rejected": -1.5426223278045654, "logps/chosen": -41.785465240478516, "logps/rejected": -82.3202896118164, "loss": 0.5687, "rewards/accuracies": 0.0, "rewards/chosen": 1.7626022100448608, "rewards/margins": -0.7491036653518677, "rewards/rejected": 2.5117058753967285, "step": 3229 }, { "epoch": 0.71, "learning_rate": 7.4428716340895404e-06, "logits/chosen": -1.445766806602478, "logits/rejected": -1.4868450164794922, "logps/chosen": -55.36445617675781, "logps/rejected": -51.25929641723633, "loss": 0.7043, "rewards/accuracies": 1.0, "rewards/chosen": 3.3289763927459717, "rewards/margins": 0.992711067199707, "rewards/rejected": 2.3362653255462646, "step": 3230 }, { "epoch": 0.72, "learning_rate": 7.441307630993077e-06, "logits/chosen": -1.6715633869171143, "logits/rejected": -1.5932224988937378, "logps/chosen": -42.77889633178711, "logps/rejected": -17.126750946044922, "loss": 0.5045, "rewards/accuracies": 1.0, "rewards/chosen": 1.4084380865097046, "rewards/margins": 0.30016517639160156, "rewards/rejected": 1.108272910118103, "step": 3231 }, { "epoch": 0.72, "learning_rate": 7.4397433141944494e-06, "logits/chosen": -1.8453928232192993, "logits/rejected": -1.809948444366455, "logps/chosen": -124.19406127929688, "logps/rejected": -102.07431030273438, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 10.3558931350708, "rewards/margins": 3.044332981109619, "rewards/rejected": 7.311560153961182, "step": 3232 }, { "epoch": 0.72, "learning_rate": 7.438178683894674e-06, "logits/chosen": -1.6301740407943726, "logits/rejected": -1.565264105796814, "logps/chosen": -46.17409133911133, "logps/rejected": -23.426860809326172, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 2.747631549835205, "rewards/margins": 2.341320753097534, "rewards/rejected": 0.4063108563423157, "step": 3233 }, { "epoch": 0.72, "learning_rate": 7.436613740294798e-06, "logits/chosen": -1.8362654447555542, "logits/rejected": -1.3850177526474, "logps/chosen": -66.02754974365234, "logps/rejected": -83.86394500732422, "loss": 0.3961, "rewards/accuracies": 1.0, "rewards/chosen": 3.5368385314941406, "rewards/margins": 0.333096981048584, "rewards/rejected": 3.2037415504455566, "step": 3234 }, { "epoch": 0.72, "learning_rate": 7.435048483595913e-06, "logits/chosen": -1.6436506509780884, "logits/rejected": -1.4720853567123413, "logps/chosen": -44.45391845703125, "logps/rejected": -150.42185974121094, "loss": 2.3141, "rewards/accuracies": 0.0, "rewards/chosen": 2.2306442260742188, "rewards/margins": -4.017712593078613, "rewards/rejected": 6.248356819152832, "step": 3235 }, { "epoch": 0.72, "learning_rate": 7.433482913999152e-06, "logits/chosen": -1.727420687675476, "logits/rejected": -1.6352214813232422, "logps/chosen": -109.65975189208984, "logps/rejected": -77.70074462890625, "loss": 0.9882, "rewards/accuracies": 0.0, "rewards/chosen": 3.475945234298706, "rewards/margins": -1.7981455326080322, "rewards/rejected": 5.274090766906738, "step": 3236 }, { "epoch": 0.72, "learning_rate": 7.431917031705686e-06, "logits/chosen": -1.6663938760757446, "logits/rejected": -1.6663938760757446, "logps/chosen": -96.28137969970703, "logps/rejected": -96.28137969970703, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 8.927082061767578, "rewards/margins": 0.0, "rewards/rejected": 8.927082061767578, "step": 3237 }, { "epoch": 0.72, "learning_rate": 7.430350836916727e-06, "logits/chosen": -1.421518325805664, "logits/rejected": -1.4336214065551758, "logps/chosen": -49.2376708984375, "logps/rejected": -34.742774963378906, "loss": 1.7538, "rewards/accuracies": 0.0, "rewards/chosen": 2.442981719970703, "rewards/margins": -1.209010362625122, "rewards/rejected": 3.651992082595825, "step": 3238 }, { "epoch": 0.72, "learning_rate": 7.42878432983353e-06, "logits/chosen": -1.317724585533142, "logits/rejected": -1.317724585533142, "logps/chosen": -30.540040969848633, "logps/rejected": -30.540040969848633, "loss": 0.3566, "rewards/accuracies": 0.0, "rewards/chosen": 3.8929100036621094, "rewards/margins": 0.0, "rewards/rejected": 3.8929100036621094, "step": 3239 }, { "epoch": 0.72, "learning_rate": 7.427217510657383e-06, "logits/chosen": -1.2446233034133911, "logits/rejected": -1.1023122072219849, "logps/chosen": -41.988197326660156, "logps/rejected": -10.61522388458252, "loss": 0.4305, "rewards/accuracies": 1.0, "rewards/chosen": 3.1399223804473877, "rewards/margins": 2.404367208480835, "rewards/rejected": 0.7355551719665527, "step": 3240 }, { "epoch": 0.72, "learning_rate": 7.425650379589622e-06, "logits/chosen": -1.6686770915985107, "logits/rejected": -1.6748132705688477, "logps/chosen": -49.831146240234375, "logps/rejected": -56.296958923339844, "loss": 0.3614, "rewards/accuracies": 0.0, "rewards/chosen": 2.14311146736145, "rewards/margins": -0.01083683967590332, "rewards/rejected": 2.1539483070373535, "step": 3241 }, { "epoch": 0.72, "learning_rate": 7.424082936831618e-06, "logits/chosen": -1.6285265684127808, "logits/rejected": -1.6574333906173706, "logps/chosen": -86.54518127441406, "logps/rejected": -82.93051147460938, "loss": 1.446, "rewards/accuracies": 0.0, "rewards/chosen": 3.760517120361328, "rewards/margins": -1.6318764686584473, "rewards/rejected": 5.392393589019775, "step": 3242 }, { "epoch": 0.72, "learning_rate": 7.4225151825847855e-06, "logits/chosen": -1.6756588220596313, "logits/rejected": -1.6936897039413452, "logps/chosen": -69.655517578125, "logps/rejected": -64.81556701660156, "loss": 2.7774, "rewards/accuracies": 0.0, "rewards/chosen": 1.8771873712539673, "rewards/margins": -3.953733444213867, "rewards/rejected": 5.830920696258545, "step": 3243 }, { "epoch": 0.72, "learning_rate": 7.420947117050573e-06, "logits/chosen": -1.3429266214370728, "logits/rejected": -1.3178434371948242, "logps/chosen": -37.362648010253906, "logps/rejected": -57.32781982421875, "loss": 0.3494, "rewards/accuracies": 1.0, "rewards/chosen": 2.820746660232544, "rewards/margins": 0.7560043334960938, "rewards/rejected": 2.06474232673645, "step": 3244 }, { "epoch": 0.72, "learning_rate": 7.419378740430477e-06, "logits/chosen": -1.7709354162216187, "logits/rejected": -1.7332501411437988, "logps/chosen": -95.60848999023438, "logps/rejected": -73.8693618774414, "loss": 0.1322, "rewards/accuracies": 1.0, "rewards/chosen": 8.652478218078613, "rewards/margins": 1.2026495933532715, "rewards/rejected": 7.449828624725342, "step": 3245 }, { "epoch": 0.72, "learning_rate": 7.417810052926029e-06, "logits/chosen": -1.5306183099746704, "logits/rejected": -1.4964088201522827, "logps/chosen": -90.02351379394531, "logps/rejected": -99.16328430175781, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 6.7693772315979, "rewards/margins": 2.227036952972412, "rewards/rejected": 4.542340278625488, "step": 3246 }, { "epoch": 0.72, "learning_rate": 7.416241054738801e-06, "logits/chosen": -1.6682320833206177, "logits/rejected": -1.5882865190505981, "logps/chosen": -130.27764892578125, "logps/rejected": -81.12161254882812, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": 7.874392986297607, "rewards/margins": 1.105992317199707, "rewards/rejected": 6.7684006690979, "step": 3247 }, { "epoch": 0.72, "learning_rate": 7.414671746070407e-06, "logits/chosen": -1.3797059059143066, "logits/rejected": -1.3797059059143066, "logps/chosen": -68.56128692626953, "logps/rejected": -68.56128692626953, "loss": 0.4292, "rewards/accuracies": 0.0, "rewards/chosen": 3.564891815185547, "rewards/margins": 0.0, "rewards/rejected": 3.564891815185547, "step": 3248 }, { "epoch": 0.72, "learning_rate": 7.413102127122498e-06, "logits/chosen": -1.4366343021392822, "logits/rejected": -1.4169467687606812, "logps/chosen": -60.42164611816406, "logps/rejected": -38.09739685058594, "loss": 1.5776, "rewards/accuracies": 0.0, "rewards/chosen": 1.1214447021484375, "rewards/margins": -2.036586046218872, "rewards/rejected": 3.1580307483673096, "step": 3249 }, { "epoch": 0.72, "learning_rate": 7.411532198096764e-06, "logits/chosen": -1.8467416763305664, "logits/rejected": -1.6979097127914429, "logps/chosen": -185.75912475585938, "logps/rejected": -65.6976089477539, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 6.939885139465332, "rewards/margins": 3.679781436920166, "rewards/rejected": 3.260103702545166, "step": 3250 }, { "epoch": 0.72, "learning_rate": 7.409961959194942e-06, "logits/chosen": -1.4046580791473389, "logits/rejected": -1.194094181060791, "logps/chosen": -101.84835815429688, "logps/rejected": -49.06765365600586, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 8.975485801696777, "rewards/margins": 4.898870944976807, "rewards/rejected": 4.076614856719971, "step": 3251 }, { "epoch": 0.72, "learning_rate": 7.4083914106188025e-06, "logits/chosen": -1.7481111288070679, "logits/rejected": -1.7047700881958008, "logps/chosen": -72.04180145263672, "logps/rejected": -87.25135803222656, "loss": 0.143, "rewards/accuracies": 1.0, "rewards/chosen": 3.4283082485198975, "rewards/margins": 1.1824007034301758, "rewards/rejected": 2.2459075450897217, "step": 3252 }, { "epoch": 0.72, "learning_rate": 7.406820552570156e-06, "logits/chosen": -1.485251545906067, "logits/rejected": -1.332690954208374, "logps/chosen": -142.19566345214844, "logps/rejected": -57.35932540893555, "loss": 0.3797, "rewards/accuracies": 1.0, "rewards/chosen": 8.060908317565918, "rewards/margins": 4.69045352935791, "rewards/rejected": 3.370454788208008, "step": 3253 }, { "epoch": 0.72, "learning_rate": 7.405249385250854e-06, "logits/chosen": -1.7559130191802979, "logits/rejected": -1.729867696762085, "logps/chosen": -70.76374053955078, "logps/rejected": -62.197471618652344, "loss": 0.7764, "rewards/accuracies": 1.0, "rewards/chosen": 4.471949100494385, "rewards/margins": 1.4838485717773438, "rewards/rejected": 2.988100528717041, "step": 3254 }, { "epoch": 0.72, "learning_rate": 7.403677908862788e-06, "logits/chosen": -1.3475315570831299, "logits/rejected": -1.4174917936325073, "logps/chosen": -84.09104919433594, "logps/rejected": -149.5517578125, "loss": 2.8716, "rewards/accuracies": 0.0, "rewards/chosen": 2.0586090087890625, "rewards/margins": -3.080166816711426, "rewards/rejected": 5.138775825500488, "step": 3255 }, { "epoch": 0.72, "learning_rate": 7.402106123607887e-06, "logits/chosen": -1.6062791347503662, "logits/rejected": -1.5309048891067505, "logps/chosen": -114.99120330810547, "logps/rejected": -68.04984283447266, "loss": 0.0568, "rewards/accuracies": 1.0, "rewards/chosen": 5.025886058807373, "rewards/margins": 2.1220202445983887, "rewards/rejected": 2.9038658142089844, "step": 3256 }, { "epoch": 0.72, "learning_rate": 7.400534029688126e-06, "logits/chosen": -1.4619550704956055, "logits/rejected": -1.4926890134811401, "logps/chosen": -100.21717834472656, "logps/rejected": -103.25091552734375, "loss": 2.489, "rewards/accuracies": 0.0, "rewards/chosen": 5.554622173309326, "rewards/margins": -2.733394145965576, "rewards/rejected": 8.288016319274902, "step": 3257 }, { "epoch": 0.72, "learning_rate": 7.398961627305512e-06, "logits/chosen": -1.3485246896743774, "logits/rejected": -1.2517156600952148, "logps/chosen": -142.6422882080078, "logps/rejected": -89.50070190429688, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 6.138679504394531, "rewards/margins": 1.944014072418213, "rewards/rejected": 4.194665431976318, "step": 3258 }, { "epoch": 0.72, "learning_rate": 7.397388916662096e-06, "logits/chosen": -1.4823940992355347, "logits/rejected": -1.4444303512573242, "logps/chosen": -64.1063461303711, "logps/rejected": -76.50015258789062, "loss": 0.7687, "rewards/accuracies": 0.0, "rewards/chosen": 3.575773000717163, "rewards/margins": -0.898306131362915, "rewards/rejected": 4.474079132080078, "step": 3259 }, { "epoch": 0.72, "learning_rate": 7.395815897959968e-06, "logits/chosen": -1.5221450328826904, "logits/rejected": -1.361803412437439, "logps/chosen": -75.06392669677734, "logps/rejected": -11.12886905670166, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": 5.227889537811279, "rewards/margins": 4.2975664138793945, "rewards/rejected": 0.9303231239318848, "step": 3260 }, { "epoch": 0.72, "learning_rate": 7.394242571401255e-06, "logits/chosen": -1.6577454805374146, "logits/rejected": -1.5271847248077393, "logps/chosen": -81.65669250488281, "logps/rejected": -50.51537322998047, "loss": 0.2304, "rewards/accuracies": 1.0, "rewards/chosen": 3.5842552185058594, "rewards/margins": 0.5385787487030029, "rewards/rejected": 3.0456764698028564, "step": 3261 }, { "epoch": 0.72, "learning_rate": 7.392668937188129e-06, "logits/chosen": -1.082839012145996, "logits/rejected": -0.9879686236381531, "logps/chosen": -38.059410095214844, "logps/rejected": -32.073211669921875, "loss": 1.5624, "rewards/accuracies": 0.0, "rewards/chosen": 2.2665653228759766, "rewards/margins": -1.6618907451629639, "rewards/rejected": 3.9284560680389404, "step": 3262 }, { "epoch": 0.72, "learning_rate": 7.391094995522796e-06, "logits/chosen": -1.3529220819473267, "logits/rejected": -1.3441370725631714, "logps/chosen": -66.26331329345703, "logps/rejected": -76.77474212646484, "loss": 0.7052, "rewards/accuracies": 0.0, "rewards/chosen": 3.2957725524902344, "rewards/margins": -1.1270408630371094, "rewards/rejected": 4.422813415527344, "step": 3263 }, { "epoch": 0.72, "learning_rate": 7.389520746607504e-06, "logits/chosen": -1.426871657371521, "logits/rejected": -1.4169949293136597, "logps/chosen": -44.350303649902344, "logps/rejected": -61.89799499511719, "loss": 4.6582, "rewards/accuracies": 0.0, "rewards/chosen": 2.916611433029175, "rewards/margins": -0.7530097961425781, "rewards/rejected": 3.669621229171753, "step": 3264 }, { "epoch": 0.72, "learning_rate": 7.38794619064454e-06, "logits/chosen": -1.3526537418365479, "logits/rejected": -1.3596233129501343, "logps/chosen": -74.49579620361328, "logps/rejected": -48.81298065185547, "loss": 1.7703, "rewards/accuracies": 0.0, "rewards/chosen": 3.30240797996521, "rewards/margins": -0.3229987621307373, "rewards/rejected": 3.6254067420959473, "step": 3265 }, { "epoch": 0.72, "learning_rate": 7.386371327836231e-06, "logits/chosen": -1.74008047580719, "logits/rejected": -1.5893540382385254, "logps/chosen": -109.50993347167969, "logps/rejected": -22.730487823486328, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 5.959393501281738, "rewards/margins": 5.47298002243042, "rewards/rejected": 0.4864133894443512, "step": 3266 }, { "epoch": 0.72, "learning_rate": 7.3847961583849424e-06, "logits/chosen": -1.476338267326355, "logits/rejected": -1.4382569789886475, "logps/chosen": -50.236671447753906, "logps/rejected": -47.65800094604492, "loss": 0.6489, "rewards/accuracies": 0.0, "rewards/chosen": 2.5618858337402344, "rewards/margins": -0.9273478984832764, "rewards/rejected": 3.4892337322235107, "step": 3267 }, { "epoch": 0.72, "learning_rate": 7.383220682493081e-06, "logits/chosen": -1.4729067087173462, "logits/rejected": -1.405484914779663, "logps/chosen": -98.91077423095703, "logps/rejected": -133.05697631835938, "loss": 0.1017, "rewards/accuracies": 1.0, "rewards/chosen": 8.078171730041504, "rewards/margins": 1.8106331825256348, "rewards/rejected": 6.267538547515869, "step": 3268 }, { "epoch": 0.72, "learning_rate": 7.38164490036309e-06, "logits/chosen": -1.2679102420806885, "logits/rejected": -1.21836519241333, "logps/chosen": -40.48772048950195, "logps/rejected": -37.00078582763672, "loss": 2.029, "rewards/accuracies": 1.0, "rewards/chosen": 3.505894184112549, "rewards/margins": 1.2424488067626953, "rewards/rejected": 2.2634453773498535, "step": 3269 }, { "epoch": 0.72, "learning_rate": 7.380068812197456e-06, "logits/chosen": -1.4138580560684204, "logits/rejected": -1.3626168966293335, "logps/chosen": -117.80827331542969, "logps/rejected": -94.81051635742188, "loss": 0.1881, "rewards/accuracies": 1.0, "rewards/chosen": 6.278781414031982, "rewards/margins": 2.480961799621582, "rewards/rejected": 3.7978196144104004, "step": 3270 }, { "epoch": 0.72, "learning_rate": 7.3784924181987e-06, "logits/chosen": -1.6495310068130493, "logits/rejected": -1.6882214546203613, "logps/chosen": -53.76765823364258, "logps/rejected": -70.53620910644531, "loss": 1.4313, "rewards/accuracies": 0.0, "rewards/chosen": 1.3004261255264282, "rewards/margins": -2.780299186706543, "rewards/rejected": 4.080725193023682, "step": 3271 }, { "epoch": 0.72, "learning_rate": 7.376915718569387e-06, "logits/chosen": -1.7324503660202026, "logits/rejected": -1.6110864877700806, "logps/chosen": -39.62275695800781, "logps/rejected": -17.15949249267578, "loss": 0.174, "rewards/accuracies": 1.0, "rewards/chosen": 2.902425527572632, "rewards/margins": 1.3006360530853271, "rewards/rejected": 1.6017894744873047, "step": 3272 }, { "epoch": 0.72, "learning_rate": 7.375338713512119e-06, "logits/chosen": -1.6972084045410156, "logits/rejected": -1.718867301940918, "logps/chosen": -31.582319259643555, "logps/rejected": -91.76193237304688, "loss": 0.4694, "rewards/accuracies": 0.0, "rewards/chosen": 2.0272042751312256, "rewards/margins": -0.4370870590209961, "rewards/rejected": 2.4642913341522217, "step": 3273 }, { "epoch": 0.72, "learning_rate": 7.373761403229536e-06, "logits/chosen": -2.021928071975708, "logits/rejected": -1.9930429458618164, "logps/chosen": -149.79635620117188, "logps/rejected": -35.45869064331055, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 4.345893859863281, "rewards/margins": 4.12244987487793, "rewards/rejected": 0.22344398498535156, "step": 3274 }, { "epoch": 0.72, "learning_rate": 7.372183787924317e-06, "logits/chosen": -1.3445007801055908, "logits/rejected": -1.2160147428512573, "logps/chosen": -45.63221740722656, "logps/rejected": -115.59443664550781, "loss": 4.4701, "rewards/accuracies": 0.0, "rewards/chosen": 2.3629701137542725, "rewards/margins": -5.8867597579956055, "rewards/rejected": 8.249730110168457, "step": 3275 }, { "epoch": 0.73, "learning_rate": 7.370605867799185e-06, "logits/chosen": -1.3431732654571533, "logits/rejected": -1.3431732654571533, "logps/chosen": -43.490089416503906, "logps/rejected": -43.490089416503906, "loss": 1.5502, "rewards/accuracies": 0.0, "rewards/chosen": 3.5151054859161377, "rewards/margins": 0.0, "rewards/rejected": 3.5151054859161377, "step": 3276 }, { "epoch": 0.73, "learning_rate": 7.369027643056898e-06, "logits/chosen": -1.7547487020492554, "logits/rejected": -1.656528353691101, "logps/chosen": -138.94593811035156, "logps/rejected": -62.29290008544922, "loss": 0.3186, "rewards/accuracies": 1.0, "rewards/chosen": 8.295100212097168, "rewards/margins": 1.1724505424499512, "rewards/rejected": 7.122649669647217, "step": 3277 }, { "epoch": 0.73, "learning_rate": 7.3674491139002534e-06, "logits/chosen": -1.8100529909133911, "logits/rejected": -1.7298022508621216, "logps/chosen": -50.9044189453125, "logps/rejected": -47.03901290893555, "loss": 0.2859, "rewards/accuracies": 1.0, "rewards/chosen": 3.3328819274902344, "rewards/margins": 0.4581165313720703, "rewards/rejected": 2.874765396118164, "step": 3278 }, { "epoch": 0.73, "learning_rate": 7.365870280532089e-06, "logits/chosen": -1.7727551460266113, "logits/rejected": -1.6708530187606812, "logps/chosen": -39.96454620361328, "logps/rejected": -9.055646896362305, "loss": 0.8888, "rewards/accuracies": 1.0, "rewards/chosen": 2.6848435401916504, "rewards/margins": 1.9720957279205322, "rewards/rejected": 0.7127477526664734, "step": 3279 }, { "epoch": 0.73, "learning_rate": 7.364291143155281e-06, "logits/chosen": -1.682655692100525, "logits/rejected": -1.6416616439819336, "logps/chosen": -71.68159484863281, "logps/rejected": -64.44596862792969, "loss": 0.2588, "rewards/accuracies": 1.0, "rewards/chosen": 4.87762975692749, "rewards/margins": 0.4997730255126953, "rewards/rejected": 4.377856731414795, "step": 3280 }, { "epoch": 0.73, "learning_rate": 7.362711701972746e-06, "logits/chosen": -1.7860639095306396, "logits/rejected": -1.7790395021438599, "logps/chosen": -51.224212646484375, "logps/rejected": -67.11874389648438, "loss": 0.4392, "rewards/accuracies": 0.0, "rewards/chosen": 3.187054395675659, "rewards/margins": -0.32156991958618164, "rewards/rejected": 3.508624315261841, "step": 3281 }, { "epoch": 0.73, "learning_rate": 7.361131957187435e-06, "logits/chosen": -1.3561887741088867, "logits/rejected": -1.3219656944274902, "logps/chosen": -55.10536193847656, "logps/rejected": -75.4952621459961, "loss": 1.4071, "rewards/accuracies": 0.0, "rewards/chosen": 2.840834140777588, "rewards/margins": -0.29067301750183105, "rewards/rejected": 3.131507158279419, "step": 3282 }, { "epoch": 0.73, "learning_rate": 7.359551909002345e-06, "logits/chosen": -1.5395426750183105, "logits/rejected": -1.4105952978134155, "logps/chosen": -45.84828186035156, "logps/rejected": -44.794918060302734, "loss": 0.7107, "rewards/accuracies": 0.0, "rewards/chosen": 2.092024326324463, "rewards/margins": -0.28866147994995117, "rewards/rejected": 2.380685806274414, "step": 3283 }, { "epoch": 0.73, "learning_rate": 7.3579715576205064e-06, "logits/chosen": -1.4781792163848877, "logits/rejected": -1.285197138786316, "logps/chosen": -103.85646057128906, "logps/rejected": -45.44792938232422, "loss": 0.8424, "rewards/accuracies": 1.0, "rewards/chosen": 6.6850175857543945, "rewards/margins": 4.19033145904541, "rewards/rejected": 2.4946861267089844, "step": 3284 }, { "epoch": 0.73, "learning_rate": 7.356390903244992e-06, "logits/chosen": -1.4269293546676636, "logits/rejected": -1.4501692056655884, "logps/chosen": -13.33482551574707, "logps/rejected": -49.641632080078125, "loss": 1.0218, "rewards/accuracies": 0.0, "rewards/chosen": 1.32558274269104, "rewards/margins": -0.5609270334243774, "rewards/rejected": 1.8865097761154175, "step": 3285 }, { "epoch": 0.73, "learning_rate": 7.354809946078909e-06, "logits/chosen": -1.6095702648162842, "logits/rejected": -1.6095702648162842, "logps/chosen": -5.523641586303711, "logps/rejected": -5.523641586303711, "loss": 0.5839, "rewards/accuracies": 0.0, "rewards/chosen": 0.7389621734619141, "rewards/margins": 0.0, "rewards/rejected": 0.7389621734619141, "step": 3286 }, { "epoch": 0.73, "learning_rate": 7.35322868632541e-06, "logits/chosen": -1.678885817527771, "logits/rejected": -1.6552966833114624, "logps/chosen": -81.26908874511719, "logps/rejected": -46.950103759765625, "loss": 2.0196, "rewards/accuracies": 1.0, "rewards/chosen": 4.61804723739624, "rewards/margins": 0.7909247875213623, "rewards/rejected": 3.827122449874878, "step": 3287 }, { "epoch": 0.73, "learning_rate": 7.351647124187682e-06, "logits/chosen": -1.5821442604064941, "logits/rejected": -1.535290241241455, "logps/chosen": -114.18684387207031, "logps/rejected": -53.65361785888672, "loss": 0.8739, "rewards/accuracies": 1.0, "rewards/chosen": 5.831364631652832, "rewards/margins": 2.5960915088653564, "rewards/rejected": 3.2352731227874756, "step": 3288 }, { "epoch": 0.73, "learning_rate": 7.35006525986895e-06, "logits/chosen": -1.4118732213974, "logits/rejected": -1.39844810962677, "logps/chosen": -34.50617218017578, "logps/rejected": -77.64945983886719, "loss": 1.0099, "rewards/accuracies": 1.0, "rewards/chosen": 3.303884267807007, "rewards/margins": 0.816378116607666, "rewards/rejected": 2.487506151199341, "step": 3289 }, { "epoch": 0.73, "learning_rate": 7.348483093572485e-06, "logits/chosen": -1.1609911918640137, "logits/rejected": -1.1609911918640137, "logps/chosen": -15.568793296813965, "logps/rejected": -15.568793296813965, "loss": 3.2782, "rewards/accuracies": 0.0, "rewards/chosen": 1.5020711421966553, "rewards/margins": 0.0, "rewards/rejected": 1.5020711421966553, "step": 3290 }, { "epoch": 0.73, "learning_rate": 7.346900625501585e-06, "logits/chosen": -1.5243943929672241, "logits/rejected": -1.3852063417434692, "logps/chosen": -108.01373291015625, "logps/rejected": -50.105892181396484, "loss": 0.4003, "rewards/accuracies": 1.0, "rewards/chosen": 6.164218425750732, "rewards/margins": 4.751720905303955, "rewards/rejected": 1.4124974012374878, "step": 3291 }, { "epoch": 0.73, "learning_rate": 7.345317855859597e-06, "logits/chosen": -1.9176313877105713, "logits/rejected": -1.8996660709381104, "logps/chosen": -31.845962524414062, "logps/rejected": -50.74501037597656, "loss": 0.2043, "rewards/accuracies": 1.0, "rewards/chosen": 2.2973697185516357, "rewards/margins": 0.7941218614578247, "rewards/rejected": 1.503247857093811, "step": 3292 }, { "epoch": 0.73, "learning_rate": 7.343734784849903e-06, "logits/chosen": -1.8389898538589478, "logits/rejected": -1.7959363460540771, "logps/chosen": -42.94227600097656, "logps/rejected": -45.49226379394531, "loss": 0.1894, "rewards/accuracies": 1.0, "rewards/chosen": 2.9614105224609375, "rewards/margins": 1.3415858745574951, "rewards/rejected": 1.6198246479034424, "step": 3293 }, { "epoch": 0.73, "learning_rate": 7.342151412675924e-06, "logits/chosen": -1.3003343343734741, "logits/rejected": -1.2775074243545532, "logps/chosen": -69.91999816894531, "logps/rejected": -61.64808654785156, "loss": 0.602, "rewards/accuracies": 1.0, "rewards/chosen": 3.207998752593994, "rewards/margins": 1.011824131011963, "rewards/rejected": 2.1961746215820312, "step": 3294 }, { "epoch": 0.73, "learning_rate": 7.340567739541118e-06, "logits/chosen": -1.650439739227295, "logits/rejected": -1.5175834894180298, "logps/chosen": -198.70037841796875, "logps/rejected": -69.44987487792969, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": 7.059347629547119, "rewards/margins": 2.623782157897949, "rewards/rejected": 4.43556547164917, "step": 3295 }, { "epoch": 0.73, "learning_rate": 7.338983765648985e-06, "logits/chosen": -1.595096230506897, "logits/rejected": -1.53441321849823, "logps/chosen": -45.94111633300781, "logps/rejected": -39.673583984375, "loss": 1.6316, "rewards/accuracies": 0.0, "rewards/chosen": 1.5867687463760376, "rewards/margins": -1.2351475954055786, "rewards/rejected": 2.821916341781616, "step": 3296 }, { "epoch": 0.73, "learning_rate": 7.3373994912030586e-06, "logits/chosen": -1.3714395761489868, "logits/rejected": -1.139098882675171, "logps/chosen": -79.59823608398438, "logps/rejected": -31.036752700805664, "loss": 0.5295, "rewards/accuracies": 1.0, "rewards/chosen": 4.833508491516113, "rewards/margins": 3.855454921722412, "rewards/rejected": 0.9780535101890564, "step": 3297 }, { "epoch": 0.73, "learning_rate": 7.3358149164069185e-06, "logits/chosen": -1.651909589767456, "logits/rejected": -1.6767597198486328, "logps/chosen": -80.55775451660156, "logps/rejected": -110.35477447509766, "loss": 1.3384, "rewards/accuracies": 0.0, "rewards/chosen": 5.001611232757568, "rewards/margins": -2.113297462463379, "rewards/rejected": 7.114908695220947, "step": 3298 }, { "epoch": 0.73, "learning_rate": 7.334230041464177e-06, "logits/chosen": -1.7900276184082031, "logits/rejected": -1.645325779914856, "logps/chosen": -133.06854248046875, "logps/rejected": -84.63591003417969, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 6.6529388427734375, "rewards/margins": 2.342349052429199, "rewards/rejected": 4.310589790344238, "step": 3299 }, { "epoch": 0.73, "learning_rate": 7.332644866578485e-06, "logits/chosen": -1.6300103664398193, "logits/rejected": -1.6136966943740845, "logps/chosen": -198.45289611816406, "logps/rejected": -125.18373107910156, "loss": 0.7583, "rewards/accuracies": 0.0, "rewards/chosen": 8.200657844543457, "rewards/margins": -0.7596683502197266, "rewards/rejected": 8.960326194763184, "step": 3300 }, { "epoch": 0.73, "learning_rate": 7.331059391953537e-06, "logits/chosen": -1.5307213068008423, "logits/rejected": -1.5733009576797485, "logps/chosen": -91.3842544555664, "logps/rejected": -125.13545227050781, "loss": 0.9405, "rewards/accuracies": 0.0, "rewards/chosen": 5.027092933654785, "rewards/margins": -1.7060933113098145, "rewards/rejected": 6.7331862449646, "step": 3301 }, { "epoch": 0.73, "learning_rate": 7.329473617793059e-06, "logits/chosen": -1.7092665433883667, "logits/rejected": -1.6980400085449219, "logps/chosen": -33.169822692871094, "logps/rejected": -92.0643081665039, "loss": 0.6575, "rewards/accuracies": 1.0, "rewards/chosen": 3.253664493560791, "rewards/margins": 0.2015235424041748, "rewards/rejected": 3.052140951156616, "step": 3302 }, { "epoch": 0.73, "learning_rate": 7.3278875443008215e-06, "logits/chosen": -1.3015780448913574, "logits/rejected": -1.2757500410079956, "logps/chosen": -36.58088684082031, "logps/rejected": -49.70856475830078, "loss": 0.2865, "rewards/accuracies": 1.0, "rewards/chosen": 4.464486122131348, "rewards/margins": 0.25894927978515625, "rewards/rejected": 4.205536842346191, "step": 3303 }, { "epoch": 0.73, "learning_rate": 7.326301171680632e-06, "logits/chosen": -1.8216289281845093, "logits/rejected": -1.847840666770935, "logps/chosen": -61.84014129638672, "logps/rejected": -52.418697357177734, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 6.57809591293335, "rewards/margins": 4.309076309204102, "rewards/rejected": 2.269019842147827, "step": 3304 }, { "epoch": 0.73, "learning_rate": 7.3247145001363325e-06, "logits/chosen": -1.5897198915481567, "logits/rejected": -1.4996520280838013, "logps/chosen": -68.51606750488281, "logps/rejected": -20.156211853027344, "loss": 0.5317, "rewards/accuracies": 1.0, "rewards/chosen": 2.1325225830078125, "rewards/margins": 0.8171565532684326, "rewards/rejected": 1.3153660297393799, "step": 3305 }, { "epoch": 0.73, "learning_rate": 7.32312752987181e-06, "logits/chosen": -1.354088544845581, "logits/rejected": -1.3209737539291382, "logps/chosen": -52.83307647705078, "logps/rejected": -43.07524490356445, "loss": 0.7458, "rewards/accuracies": 0.0, "rewards/chosen": 2.378185272216797, "rewards/margins": -0.7527356147766113, "rewards/rejected": 3.130920886993408, "step": 3306 }, { "epoch": 0.73, "learning_rate": 7.321540261090983e-06, "logits/chosen": -1.8879257440567017, "logits/rejected": -1.7014728784561157, "logps/chosen": -73.27035522460938, "logps/rejected": -42.66377639770508, "loss": 0.8821, "rewards/accuracies": 1.0, "rewards/chosen": 6.596548557281494, "rewards/margins": 5.313718795776367, "rewards/rejected": 1.2828296422958374, "step": 3307 }, { "epoch": 0.73, "learning_rate": 7.319952693997814e-06, "logits/chosen": -1.7810368537902832, "logits/rejected": -1.8065667152404785, "logps/chosen": -50.11409378051758, "logps/rejected": -68.88267517089844, "loss": 0.8455, "rewards/accuracies": 0.0, "rewards/chosen": 2.8496150970458984, "rewards/margins": -0.760573148727417, "rewards/rejected": 3.6101882457733154, "step": 3308 }, { "epoch": 0.73, "learning_rate": 7.318364828796301e-06, "logits/chosen": -1.5856958627700806, "logits/rejected": -1.566084861755371, "logps/chosen": -136.11756896972656, "logps/rejected": -124.74644470214844, "loss": 0.2195, "rewards/accuracies": 1.0, "rewards/chosen": 9.794876098632812, "rewards/margins": 0.88385009765625, "rewards/rejected": 8.911026000976562, "step": 3309 }, { "epoch": 0.73, "learning_rate": 7.31677666569048e-06, "logits/chosen": -1.5634634494781494, "logits/rejected": -1.5190742015838623, "logps/chosen": -33.347755432128906, "logps/rejected": -13.694541931152344, "loss": 0.5119, "rewards/accuracies": 1.0, "rewards/chosen": 1.5735725164413452, "rewards/margins": 0.9367635250091553, "rewards/rejected": 0.6368089914321899, "step": 3310 }, { "epoch": 0.73, "learning_rate": 7.315188204884426e-06, "logits/chosen": -1.4537359476089478, "logits/rejected": -1.4557551145553589, "logps/chosen": -171.42660522460938, "logps/rejected": -248.61358642578125, "loss": 2.2108, "rewards/accuracies": 0.0, "rewards/chosen": 5.291484355926514, "rewards/margins": -4.408341884613037, "rewards/rejected": 9.69982624053955, "step": 3311 }, { "epoch": 0.73, "learning_rate": 7.3135994465822535e-06, "logits/chosen": -1.8414843082427979, "logits/rejected": -1.8687740564346313, "logps/chosen": -43.067176818847656, "logps/rejected": -35.86852264404297, "loss": 1.4635, "rewards/accuracies": 0.0, "rewards/chosen": 2.8095650672912598, "rewards/margins": -1.6169824600219727, "rewards/rejected": 4.426547527313232, "step": 3312 }, { "epoch": 0.73, "learning_rate": 7.312010390988115e-06, "logits/chosen": -1.5009963512420654, "logits/rejected": -1.4442168474197388, "logps/chosen": -59.510719299316406, "logps/rejected": -56.89753723144531, "loss": 1.0652, "rewards/accuracies": 0.0, "rewards/chosen": 3.7148232460021973, "rewards/margins": -1.220674991607666, "rewards/rejected": 4.935498237609863, "step": 3313 }, { "epoch": 0.73, "learning_rate": 7.310421038306199e-06, "logits/chosen": -1.3734689950942993, "logits/rejected": -1.2520699501037598, "logps/chosen": -83.6583251953125, "logps/rejected": -41.12409210205078, "loss": 0.3002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3358490467071533, "rewards/margins": 0.22521519660949707, "rewards/rejected": 2.1106338500976562, "step": 3314 }, { "epoch": 0.73, "learning_rate": 7.308831388740734e-06, "logits/chosen": -1.4838430881500244, "logits/rejected": -1.5066564083099365, "logps/chosen": -52.30121612548828, "logps/rejected": -84.87669372558594, "loss": 0.7701, "rewards/accuracies": 0.0, "rewards/chosen": 2.7706551551818848, "rewards/margins": -1.2703757286071777, "rewards/rejected": 4.0410308837890625, "step": 3315 }, { "epoch": 0.73, "learning_rate": 7.3072414424959855e-06, "logits/chosen": -1.1490912437438965, "logits/rejected": -0.9543330669403076, "logps/chosen": -73.00518035888672, "logps/rejected": -22.203332901000977, "loss": 0.3778, "rewards/accuracies": 1.0, "rewards/chosen": 2.105769395828247, "rewards/margins": 1.4015123844146729, "rewards/rejected": 0.7042570114135742, "step": 3316 }, { "epoch": 0.73, "learning_rate": 7.305651199776258e-06, "logits/chosen": -1.399003505706787, "logits/rejected": -1.1713775396347046, "logps/chosen": -76.80471801757812, "logps/rejected": -11.114823341369629, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 6.337466716766357, "rewards/margins": 5.618464469909668, "rewards/rejected": 0.7190024256706238, "step": 3317 }, { "epoch": 0.73, "learning_rate": 7.304060660785894e-06, "logits/chosen": -1.8716495037078857, "logits/rejected": -1.8423449993133545, "logps/chosen": -66.92596435546875, "logps/rejected": -10.982208251953125, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 4.59176778793335, "rewards/margins": 3.819005250930786, "rewards/rejected": 0.7727624773979187, "step": 3318 }, { "epoch": 0.73, "learning_rate": 7.3024698257292734e-06, "logits/chosen": -1.6230926513671875, "logits/rejected": -1.5817675590515137, "logps/chosen": -120.3791275024414, "logps/rejected": -108.11123657226562, "loss": 1.4297, "rewards/accuracies": 0.0, "rewards/chosen": 7.238500118255615, "rewards/margins": -0.8673195838928223, "rewards/rejected": 8.105819702148438, "step": 3319 }, { "epoch": 0.73, "learning_rate": 7.300878694810815e-06, "logits/chosen": -1.5495221614837646, "logits/rejected": -1.5559567213058472, "logps/chosen": -49.03321075439453, "logps/rejected": -60.87506103515625, "loss": 1.2214, "rewards/accuracies": 1.0, "rewards/chosen": 5.429637432098389, "rewards/margins": 2.7439584732055664, "rewards/rejected": 2.6856789588928223, "step": 3320 }, { "epoch": 0.74, "learning_rate": 7.299287268234976e-06, "logits/chosen": -1.687182903289795, "logits/rejected": -1.582661747932434, "logps/chosen": -85.87065887451172, "logps/rejected": -70.1026611328125, "loss": 0.2755, "rewards/accuracies": 1.0, "rewards/chosen": 5.331955909729004, "rewards/margins": 0.4028449058532715, "rewards/rejected": 4.929111003875732, "step": 3321 }, { "epoch": 0.74, "learning_rate": 7.29769554620625e-06, "logits/chosen": -1.630556344985962, "logits/rejected": -1.6521373987197876, "logps/chosen": -52.93104553222656, "logps/rejected": -121.41926574707031, "loss": 4.2691, "rewards/accuracies": 0.0, "rewards/chosen": 3.4495849609375, "rewards/margins": -6.3987932205200195, "rewards/rejected": 9.84837818145752, "step": 3322 }, { "epoch": 0.74, "learning_rate": 7.296103528929172e-06, "logits/chosen": -1.605422854423523, "logits/rejected": -1.605422854423523, "logps/chosen": -52.195743560791016, "logps/rejected": -52.195743560791016, "loss": 1.0136, "rewards/accuracies": 0.0, "rewards/chosen": 5.630143642425537, "rewards/margins": 0.0, "rewards/rejected": 5.630143642425537, "step": 3323 }, { "epoch": 0.74, "learning_rate": 7.294511216608308e-06, "logits/chosen": -1.6341042518615723, "logits/rejected": -1.5468976497650146, "logps/chosen": -140.837646484375, "logps/rejected": -61.80032730102539, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 8.829376220703125, "rewards/margins": 3.9621095657348633, "rewards/rejected": 4.867266654968262, "step": 3324 }, { "epoch": 0.74, "learning_rate": 7.2929186094482695e-06, "logits/chosen": -1.6952593326568604, "logits/rejected": -1.5819587707519531, "logps/chosen": -120.5503158569336, "logps/rejected": -44.89718246459961, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 8.285441398620605, "rewards/margins": 5.874161720275879, "rewards/rejected": 2.4112796783447266, "step": 3325 }, { "epoch": 0.74, "learning_rate": 7.2913257076537e-06, "logits/chosen": -1.799731969833374, "logits/rejected": -1.727966070175171, "logps/chosen": -33.600643157958984, "logps/rejected": -26.014690399169922, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 3.1995670795440674, "rewards/margins": 2.2932677268981934, "rewards/rejected": 0.9062992334365845, "step": 3326 }, { "epoch": 0.74, "learning_rate": 7.289732511429286e-06, "logits/chosen": -1.4151833057403564, "logits/rejected": -1.373603343963623, "logps/chosen": -60.430641174316406, "logps/rejected": -61.531036376953125, "loss": 0.6709, "rewards/accuracies": 0.0, "rewards/chosen": 2.2911109924316406, "rewards/margins": -0.9560058116912842, "rewards/rejected": 3.247116804122925, "step": 3327 }, { "epoch": 0.74, "learning_rate": 7.28813902097975e-06, "logits/chosen": -1.5341426134109497, "logits/rejected": -1.4893704652786255, "logps/chosen": -55.895233154296875, "logps/rejected": -56.512176513671875, "loss": 0.9291, "rewards/accuracies": 0.0, "rewards/chosen": 2.6407997608184814, "rewards/margins": -1.6520216464996338, "rewards/rejected": 4.292821407318115, "step": 3328 }, { "epoch": 0.74, "learning_rate": 7.286545236509848e-06, "logits/chosen": -1.5916802883148193, "logits/rejected": -1.5916802883148193, "logps/chosen": -40.62104797363281, "logps/rejected": -40.62104797363281, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 4.044549465179443, "rewards/margins": 0.0, "rewards/rejected": 4.044549465179443, "step": 3329 }, { "epoch": 0.74, "learning_rate": 7.284951158224381e-06, "logits/chosen": -1.6416858434677124, "logits/rejected": -1.4862524271011353, "logps/chosen": -107.16789245605469, "logps/rejected": -60.21741485595703, "loss": 0.418, "rewards/accuracies": 1.0, "rewards/chosen": 9.109969139099121, "rewards/margins": 4.8014655113220215, "rewards/rejected": 4.3085036277771, "step": 3330 }, { "epoch": 0.74, "learning_rate": 7.283356786328184e-06, "logits/chosen": -1.5726678371429443, "logits/rejected": -1.4430781602859497, "logps/chosen": -127.74186706542969, "logps/rejected": -39.11620330810547, "loss": 1.2933, "rewards/accuracies": 1.0, "rewards/chosen": 5.215327739715576, "rewards/margins": 0.9176340103149414, "rewards/rejected": 4.297693729400635, "step": 3331 }, { "epoch": 0.74, "learning_rate": 7.281762121026129e-06, "logits/chosen": -1.6604458093643188, "logits/rejected": -1.6937954425811768, "logps/chosen": -84.1131820678711, "logps/rejected": -86.673583984375, "loss": 1.2258, "rewards/accuracies": 0.0, "rewards/chosen": 3.7769951820373535, "rewards/margins": -2.351177215576172, "rewards/rejected": 6.128172397613525, "step": 3332 }, { "epoch": 0.74, "learning_rate": 7.280167162523125e-06, "logits/chosen": -1.915590524673462, "logits/rejected": -1.9009228944778442, "logps/chosen": -55.20825958251953, "logps/rejected": -63.03886413574219, "loss": 0.6356, "rewards/accuracies": 1.0, "rewards/chosen": 3.685849905014038, "rewards/margins": 0.04899144172668457, "rewards/rejected": 3.6368584632873535, "step": 3333 }, { "epoch": 0.74, "learning_rate": 7.278571911024124e-06, "logits/chosen": -1.5641307830810547, "logits/rejected": -1.4867432117462158, "logps/chosen": -47.137451171875, "logps/rejected": -54.54196548461914, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 3.3668160438537598, "rewards/margins": 0.7272305488586426, "rewards/rejected": 2.639585494995117, "step": 3334 }, { "epoch": 0.74, "learning_rate": 7.276976366734109e-06, "logits/chosen": -1.5731233358383179, "logits/rejected": -1.5741134881973267, "logps/chosen": -71.68888854980469, "logps/rejected": -65.53245544433594, "loss": 2.7234, "rewards/accuracies": 0.0, "rewards/chosen": 2.4159018993377686, "rewards/margins": -0.9635934829711914, "rewards/rejected": 3.37949538230896, "step": 3335 }, { "epoch": 0.74, "learning_rate": 7.2753805298581035e-06, "logits/chosen": -1.5514342784881592, "logits/rejected": -1.5274614095687866, "logps/chosen": -55.75445556640625, "logps/rejected": -56.98611068725586, "loss": 0.1773, "rewards/accuracies": 1.0, "rewards/chosen": 3.291670322418213, "rewards/margins": 0.8971965312957764, "rewards/rejected": 2.3944737911224365, "step": 3336 }, { "epoch": 0.74, "learning_rate": 7.273784400601171e-06, "logits/chosen": -1.6325855255126953, "logits/rejected": -1.6543388366699219, "logps/chosen": -61.843955993652344, "logps/rejected": -37.01481628417969, "loss": 0.9952, "rewards/accuracies": 0.0, "rewards/chosen": 2.3482933044433594, "rewards/margins": -0.9781129360198975, "rewards/rejected": 3.326406240463257, "step": 3337 }, { "epoch": 0.74, "learning_rate": 7.272187979168408e-06, "logits/chosen": -1.4361554384231567, "logits/rejected": -1.465973973274231, "logps/chosen": -71.45193481445312, "logps/rejected": -50.0472297668457, "loss": 0.3669, "rewards/accuracies": 0.0, "rewards/chosen": 4.760346412658691, "rewards/margins": -0.06395196914672852, "rewards/rejected": 4.82429838180542, "step": 3338 }, { "epoch": 0.74, "learning_rate": 7.270591265764952e-06, "logits/chosen": -1.6193057298660278, "logits/rejected": -1.6270332336425781, "logps/chosen": -52.760353088378906, "logps/rejected": -101.192138671875, "loss": 1.492, "rewards/accuracies": 0.0, "rewards/chosen": 2.3698647022247314, "rewards/margins": -1.9386694431304932, "rewards/rejected": 4.308534145355225, "step": 3339 }, { "epoch": 0.74, "learning_rate": 7.2689942605959784e-06, "logits/chosen": -1.373737096786499, "logits/rejected": -1.3694870471954346, "logps/chosen": -121.90116119384766, "logps/rejected": -127.85130310058594, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 6.747518062591553, "rewards/margins": 1.7458500862121582, "rewards/rejected": 5.0016679763793945, "step": 3340 }, { "epoch": 0.74, "learning_rate": 7.267396963866696e-06, "logits/chosen": -1.4202510118484497, "logits/rejected": -1.28911292552948, "logps/chosen": -43.799705505371094, "logps/rejected": -89.11480712890625, "loss": 2.6686, "rewards/accuracies": 1.0, "rewards/chosen": 2.4898293018341064, "rewards/margins": 1.5883164405822754, "rewards/rejected": 0.9015129208564758, "step": 3341 }, { "epoch": 0.74, "learning_rate": 7.265799375782354e-06, "logits/chosen": -1.5492756366729736, "logits/rejected": -1.4907584190368652, "logps/chosen": -44.570762634277344, "logps/rejected": -52.67605972290039, "loss": 0.7439, "rewards/accuracies": 1.0, "rewards/chosen": 4.41113805770874, "rewards/margins": 1.3865976333618164, "rewards/rejected": 3.024540424346924, "step": 3342 }, { "epoch": 0.74, "learning_rate": 7.264201496548239e-06, "logits/chosen": -1.7906029224395752, "logits/rejected": -1.8056186437606812, "logps/chosen": -99.88375091552734, "logps/rejected": -91.4031982421875, "loss": 1.8997, "rewards/accuracies": 0.0, "rewards/chosen": 6.329201698303223, "rewards/margins": -3.579334259033203, "rewards/rejected": 9.908535957336426, "step": 3343 }, { "epoch": 0.74, "learning_rate": 7.262603326369675e-06, "logits/chosen": -1.1466443538665771, "logits/rejected": -1.1466443538665771, "logps/chosen": -15.301383018493652, "logps/rejected": -15.301383018493652, "loss": 0.3495, "rewards/accuracies": 0.0, "rewards/chosen": 3.176577091217041, "rewards/margins": 0.0, "rewards/rejected": 3.176577091217041, "step": 3344 }, { "epoch": 0.74, "learning_rate": 7.261004865452024e-06, "logits/chosen": -1.3892711400985718, "logits/rejected": -1.3036600351333618, "logps/chosen": -52.566612243652344, "logps/rejected": -42.56349182128906, "loss": 0.3844, "rewards/accuracies": 0.0, "rewards/chosen": 2.5835914611816406, "rewards/margins": -0.04761052131652832, "rewards/rejected": 2.631201982498169, "step": 3345 }, { "epoch": 0.74, "learning_rate": 7.259406114000681e-06, "logits/chosen": -1.796571135520935, "logits/rejected": -1.796571135520935, "logps/chosen": -54.70109176635742, "logps/rejected": -54.70109176635742, "loss": 0.8124, "rewards/accuracies": 0.0, "rewards/chosen": 2.4513440132141113, "rewards/margins": 0.0, "rewards/rejected": 2.4513440132141113, "step": 3346 }, { "epoch": 0.74, "learning_rate": 7.257807072221084e-06, "logits/chosen": -1.5045348405838013, "logits/rejected": -1.5045348405838013, "logps/chosen": -55.48483657836914, "logps/rejected": -55.48483657836914, "loss": 0.8264, "rewards/accuracies": 0.0, "rewards/chosen": 3.9095304012298584, "rewards/margins": 0.0, "rewards/rejected": 3.9095304012298584, "step": 3347 }, { "epoch": 0.74, "learning_rate": 7.256207740318706e-06, "logits/chosen": -1.7393049001693726, "logits/rejected": -1.7029746770858765, "logps/chosen": -70.4522705078125, "logps/rejected": -73.52815246582031, "loss": 1.6149, "rewards/accuracies": 0.0, "rewards/chosen": 1.8907577991485596, "rewards/margins": -2.790126085281372, "rewards/rejected": 4.680883884429932, "step": 3348 }, { "epoch": 0.74, "learning_rate": 7.254608118499058e-06, "logits/chosen": -1.353722333908081, "logits/rejected": -1.3381067514419556, "logps/chosen": -36.5557861328125, "logps/rejected": -23.058996200561523, "loss": 0.4081, "rewards/accuracies": 1.0, "rewards/chosen": 3.598724365234375, "rewards/margins": 2.03242826461792, "rewards/rejected": 1.5662962198257446, "step": 3349 }, { "epoch": 0.74, "learning_rate": 7.253008206967686e-06, "logits/chosen": -1.7501884698867798, "logits/rejected": -1.7501884698867798, "logps/chosen": -38.3196907043457, "logps/rejected": -38.3196907043457, "loss": 1.3824, "rewards/accuracies": 0.0, "rewards/chosen": 1.884398341178894, "rewards/margins": 0.0, "rewards/rejected": 1.884398341178894, "step": 3350 }, { "epoch": 0.74, "learning_rate": 7.251408005930176e-06, "logits/chosen": -1.5495575666427612, "logits/rejected": -1.3058873414993286, "logps/chosen": -121.25570678710938, "logps/rejected": -40.15468215942383, "loss": 0.7067, "rewards/accuracies": 1.0, "rewards/chosen": 6.1626877784729, "rewards/margins": 3.581277847290039, "rewards/rejected": 2.5814099311828613, "step": 3351 }, { "epoch": 0.74, "learning_rate": 7.249807515592149e-06, "logits/chosen": -1.9328030347824097, "logits/rejected": -1.8894237279891968, "logps/chosen": -85.29712677001953, "logps/rejected": -44.189353942871094, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 8.409340858459473, "rewards/margins": 4.314929962158203, "rewards/rejected": 4.0944108963012695, "step": 3352 }, { "epoch": 0.74, "learning_rate": 7.248206736159264e-06, "logits/chosen": -1.449469804763794, "logits/rejected": -1.46703040599823, "logps/chosen": -31.242021560668945, "logps/rejected": -21.66986656188965, "loss": 0.4468, "rewards/accuracies": 0.0, "rewards/chosen": 2.1065986156463623, "rewards/margins": -0.28242945671081543, "rewards/rejected": 2.3890280723571777, "step": 3353 }, { "epoch": 0.74, "learning_rate": 7.246605667837219e-06, "logits/chosen": -1.327497959136963, "logits/rejected": -1.3641666173934937, "logps/chosen": -61.64289474487305, "logps/rejected": -118.12026977539062, "loss": 3.1488, "rewards/accuracies": 0.0, "rewards/chosen": 3.2916600704193115, "rewards/margins": -6.218006134033203, "rewards/rejected": 9.509666442871094, "step": 3354 }, { "epoch": 0.74, "learning_rate": 7.245004310831747e-06, "logits/chosen": -1.5811965465545654, "logits/rejected": -1.562431812286377, "logps/chosen": -48.67958450317383, "logps/rejected": -57.374473571777344, "loss": 1.0603, "rewards/accuracies": 0.0, "rewards/chosen": 2.486710786819458, "rewards/margins": -1.818289041519165, "rewards/rejected": 4.304999828338623, "step": 3355 }, { "epoch": 0.74, "learning_rate": 7.2434026653486165e-06, "logits/chosen": -1.658192753791809, "logits/rejected": -1.680256962776184, "logps/chosen": -172.82737731933594, "logps/rejected": -99.37528991699219, "loss": 0.3146, "rewards/accuracies": 1.0, "rewards/chosen": 6.7852020263671875, "rewards/margins": 0.1420731544494629, "rewards/rejected": 6.643128871917725, "step": 3356 }, { "epoch": 0.74, "learning_rate": 7.241800731593636e-06, "logits/chosen": -1.6867161989212036, "logits/rejected": -1.60872483253479, "logps/chosen": -71.75768280029297, "logps/rejected": -26.66391944885254, "loss": 0.8108, "rewards/accuracies": 0.0, "rewards/chosen": 1.4756386280059814, "rewards/margins": -1.3880460262298584, "rewards/rejected": 2.86368465423584, "step": 3357 }, { "epoch": 0.74, "learning_rate": 7.2401985097726534e-06, "logits/chosen": -1.4256434440612793, "logits/rejected": -1.4244314432144165, "logps/chosen": -45.15983581542969, "logps/rejected": -74.93379974365234, "loss": 0.6206, "rewards/accuracies": 1.0, "rewards/chosen": 2.899859666824341, "rewards/margins": 0.4582092761993408, "rewards/rejected": 2.441650390625, "step": 3358 }, { "epoch": 0.74, "learning_rate": 7.238596000091546e-06, "logits/chosen": -1.2149643898010254, "logits/rejected": -1.2057026624679565, "logps/chosen": -34.897735595703125, "logps/rejected": -67.57551574707031, "loss": 2.8879, "rewards/accuracies": 0.0, "rewards/chosen": 1.4679489135742188, "rewards/margins": -1.5004403591156006, "rewards/rejected": 2.9683892726898193, "step": 3359 }, { "epoch": 0.74, "learning_rate": 7.236993202756233e-06, "logits/chosen": -1.3723996877670288, "logits/rejected": -1.3692469596862793, "logps/chosen": -54.863853454589844, "logps/rejected": -42.387596130371094, "loss": 0.5944, "rewards/accuracies": 0.0, "rewards/chosen": 2.2969071865081787, "rewards/margins": -0.8231086730957031, "rewards/rejected": 3.120015859603882, "step": 3360 }, { "epoch": 0.74, "learning_rate": 7.2353901179726715e-06, "logits/chosen": -1.634771704673767, "logits/rejected": -1.2780119180679321, "logps/chosen": -48.85631561279297, "logps/rejected": -84.72330474853516, "loss": 1.5361, "rewards/accuracies": 0.0, "rewards/chosen": 3.1622016429901123, "rewards/margins": -2.6175220012664795, "rewards/rejected": 5.779723644256592, "step": 3361 }, { "epoch": 0.74, "learning_rate": 7.233786745946854e-06, "logits/chosen": -1.4538387060165405, "logits/rejected": -1.44734525680542, "logps/chosen": -72.80662536621094, "logps/rejected": -61.33636474609375, "loss": 1.6097, "rewards/accuracies": 1.0, "rewards/chosen": 4.51171875, "rewards/margins": 1.0481040477752686, "rewards/rejected": 3.4636147022247314, "step": 3362 }, { "epoch": 0.74, "learning_rate": 7.232183086884809e-06, "logits/chosen": -1.4901087284088135, "logits/rejected": -1.439390778541565, "logps/chosen": -88.96818542480469, "logps/rejected": -121.16275024414062, "loss": 1.1144, "rewards/accuracies": 0.0, "rewards/chosen": 4.851603984832764, "rewards/margins": -1.7494001388549805, "rewards/rejected": 6.601004123687744, "step": 3363 }, { "epoch": 0.74, "learning_rate": 7.230579140992604e-06, "logits/chosen": -1.549822449684143, "logits/rejected": -1.0832858085632324, "logps/chosen": -116.89418029785156, "logps/rejected": -124.65202331542969, "loss": 0.2426, "rewards/accuracies": 1.0, "rewards/chosen": 4.7713303565979, "rewards/margins": 0.5105924606323242, "rewards/rejected": 4.260737895965576, "step": 3364 }, { "epoch": 0.74, "learning_rate": 7.2289749084763415e-06, "logits/chosen": -1.3303914070129395, "logits/rejected": -1.3217277526855469, "logps/chosen": -67.30851745605469, "logps/rejected": -26.344375610351562, "loss": 0.6093, "rewards/accuracies": 1.0, "rewards/chosen": 1.6326111555099487, "rewards/margins": 0.05009198188781738, "rewards/rejected": 1.5825191736221313, "step": 3365 }, { "epoch": 0.75, "learning_rate": 7.227370389542161e-06, "logits/chosen": -1.6056859493255615, "logits/rejected": -1.6043204069137573, "logps/chosen": -43.927589416503906, "logps/rejected": -51.568424224853516, "loss": 0.8004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7660751342773438, "rewards/margins": 0.47466540336608887, "rewards/rejected": 2.291409730911255, "step": 3366 }, { "epoch": 0.75, "learning_rate": 7.225765584396241e-06, "logits/chosen": -1.4875961542129517, "logits/rejected": -1.5188863277435303, "logps/chosen": -37.36771774291992, "logps/rejected": -42.22905349731445, "loss": 1.5133, "rewards/accuracies": 0.0, "rewards/chosen": 2.1746761798858643, "rewards/margins": -1.151301622390747, "rewards/rejected": 3.3259778022766113, "step": 3367 }, { "epoch": 0.75, "learning_rate": 7.224160493244794e-06, "logits/chosen": -1.3229283094406128, "logits/rejected": -1.3110932111740112, "logps/chosen": -118.40696716308594, "logps/rejected": -73.2928466796875, "loss": 1.1033, "rewards/accuracies": 0.0, "rewards/chosen": 5.8206634521484375, "rewards/margins": -0.3709731101989746, "rewards/rejected": 6.191636562347412, "step": 3368 }, { "epoch": 0.75, "learning_rate": 7.222555116294069e-06, "logits/chosen": -1.3795239925384521, "logits/rejected": -1.369950771331787, "logps/chosen": -54.25190353393555, "logps/rejected": -11.428409576416016, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 3.209383010864258, "rewards/margins": 2.142383575439453, "rewards/rejected": 1.0669994354248047, "step": 3369 }, { "epoch": 0.75, "learning_rate": 7.220949453750355e-06, "logits/chosen": -1.442967176437378, "logits/rejected": -1.493927240371704, "logps/chosen": -59.25270080566406, "logps/rejected": -47.747215270996094, "loss": 2.0537, "rewards/accuracies": 0.0, "rewards/chosen": 1.9432334899902344, "rewards/margins": -4.088243007659912, "rewards/rejected": 6.0314764976501465, "step": 3370 }, { "epoch": 0.75, "learning_rate": 7.219343505819975e-06, "logits/chosen": -1.1184810400009155, "logits/rejected": -1.058617353439331, "logps/chosen": -51.34865188598633, "logps/rejected": -38.080169677734375, "loss": 1.9661, "rewards/accuracies": 0.0, "rewards/chosen": 2.146014928817749, "rewards/margins": -0.7284433841705322, "rewards/rejected": 2.8744583129882812, "step": 3371 }, { "epoch": 0.75, "learning_rate": 7.21773727270929e-06, "logits/chosen": -1.5817334651947021, "logits/rejected": -1.5281343460083008, "logps/chosen": -43.61981964111328, "logps/rejected": -46.59297561645508, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": 3.5996780395507812, "rewards/margins": 1.9327083826065063, "rewards/rejected": 1.666969656944275, "step": 3372 }, { "epoch": 0.75, "learning_rate": 7.216130754624696e-06, "logits/chosen": -1.586921215057373, "logits/rejected": -1.5663808584213257, "logps/chosen": -103.58122253417969, "logps/rejected": -89.73562622070312, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 8.519187927246094, "rewards/margins": 4.018988132476807, "rewards/rejected": 4.500199794769287, "step": 3373 }, { "epoch": 0.75, "learning_rate": 7.21452395177263e-06, "logits/chosen": -1.6872525215148926, "logits/rejected": -1.7068711519241333, "logps/chosen": -74.92265319824219, "logps/rejected": -63.04228973388672, "loss": 0.4989, "rewards/accuracies": 0.0, "rewards/chosen": 3.786675214767456, "rewards/margins": -0.4575355052947998, "rewards/rejected": 4.244210720062256, "step": 3374 }, { "epoch": 0.75, "learning_rate": 7.212916864359557e-06, "logits/chosen": -1.5028749704360962, "logits/rejected": -1.444767713546753, "logps/chosen": -68.00598907470703, "logps/rejected": -73.78175354003906, "loss": 0.5041, "rewards/accuracies": 0.0, "rewards/chosen": 3.145754337310791, "rewards/margins": -0.21247410774230957, "rewards/rejected": 3.3582284450531006, "step": 3375 }, { "epoch": 0.75, "learning_rate": 7.211309492591988e-06, "logits/chosen": -1.2985620498657227, "logits/rejected": -1.3528777360916138, "logps/chosen": -89.61373901367188, "logps/rejected": -70.36726379394531, "loss": 0.2561, "rewards/accuracies": 1.0, "rewards/chosen": 5.14956521987915, "rewards/margins": 0.591555118560791, "rewards/rejected": 4.558010101318359, "step": 3376 }, { "epoch": 0.75, "learning_rate": 7.209701836676465e-06, "logits/chosen": -1.9254913330078125, "logits/rejected": -1.8885399103164673, "logps/chosen": -51.456825256347656, "logps/rejected": -27.23768424987793, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": 2.7580039501190186, "rewards/margins": 3.1452479362487793, "rewards/rejected": -0.3872440457344055, "step": 3377 }, { "epoch": 0.75, "learning_rate": 7.208093896819567e-06, "logits/chosen": -1.5976144075393677, "logits/rejected": -1.516768217086792, "logps/chosen": -177.27777099609375, "logps/rejected": -291.86175537109375, "loss": 0.9647, "rewards/accuracies": 0.0, "rewards/chosen": 6.917697429656982, "rewards/margins": -1.7712979316711426, "rewards/rejected": 8.688995361328125, "step": 3378 }, { "epoch": 0.75, "learning_rate": 7.206485673227912e-06, "logits/chosen": -1.5099672079086304, "logits/rejected": -1.4664149284362793, "logps/chosen": -190.98431396484375, "logps/rejected": -52.13262176513672, "loss": 0.2219, "rewards/accuracies": 1.0, "rewards/chosen": 4.212407112121582, "rewards/margins": 1.257016897201538, "rewards/rejected": 2.955390214920044, "step": 3379 }, { "epoch": 0.75, "learning_rate": 7.2048771661081515e-06, "logits/chosen": -1.6795393228530884, "logits/rejected": -1.6575006246566772, "logps/chosen": -58.88589859008789, "logps/rejected": -70.35237121582031, "loss": 2.4368, "rewards/accuracies": 0.0, "rewards/chosen": 3.2482998371124268, "rewards/margins": -0.4519679546356201, "rewards/rejected": 3.700267791748047, "step": 3380 }, { "epoch": 0.75, "learning_rate": 7.203268375666976e-06, "logits/chosen": -1.226733922958374, "logits/rejected": -1.174187183380127, "logps/chosen": -38.044593811035156, "logps/rejected": -23.960468292236328, "loss": 0.7396, "rewards/accuracies": 0.0, "rewards/chosen": 1.3042877912521362, "rewards/margins": -1.1076735258102417, "rewards/rejected": 2.411961317062378, "step": 3381 }, { "epoch": 0.75, "learning_rate": 7.201659302111111e-06, "logits/chosen": -1.4045454263687134, "logits/rejected": -1.3524688482284546, "logps/chosen": -133.3416290283203, "logps/rejected": -39.64311599731445, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 5.427879333496094, "rewards/margins": 1.8620274066925049, "rewards/rejected": 3.565851926803589, "step": 3382 }, { "epoch": 0.75, "learning_rate": 7.2000499456473186e-06, "logits/chosen": -1.2929514646530151, "logits/rejected": -0.901875376701355, "logps/chosen": -66.5008544921875, "logps/rejected": -62.155921936035156, "loss": 1.1868, "rewards/accuracies": 0.0, "rewards/chosen": 2.344134569168091, "rewards/margins": -0.870175838470459, "rewards/rejected": 3.21431040763855, "step": 3383 }, { "epoch": 0.75, "learning_rate": 7.198440306482397e-06, "logits/chosen": -1.5241512060165405, "logits/rejected": -1.4526079893112183, "logps/chosen": -55.09733200073242, "logps/rejected": -44.736793518066406, "loss": 0.2055, "rewards/accuracies": 1.0, "rewards/chosen": 4.346296310424805, "rewards/margins": 1.6238696575164795, "rewards/rejected": 2.722426652908325, "step": 3384 }, { "epoch": 0.75, "learning_rate": 7.19683038482318e-06, "logits/chosen": -1.5313266515731812, "logits/rejected": -1.4969829320907593, "logps/chosen": -66.5343246459961, "logps/rejected": -52.331512451171875, "loss": 0.9225, "rewards/accuracies": 1.0, "rewards/chosen": 2.5931594371795654, "rewards/margins": 0.17584824562072754, "rewards/rejected": 2.417311191558838, "step": 3385 }, { "epoch": 0.75, "learning_rate": 7.195220180876541e-06, "logits/chosen": -1.482917070388794, "logits/rejected": -1.222415566444397, "logps/chosen": -115.04396057128906, "logps/rejected": -19.74828338623047, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 6.4056243896484375, "rewards/margins": 4.395932674407959, "rewards/rejected": 2.0096917152404785, "step": 3386 }, { "epoch": 0.75, "learning_rate": 7.193609694849385e-06, "logits/chosen": -1.5036933422088623, "logits/rejected": -1.4709279537200928, "logps/chosen": -44.35907745361328, "logps/rejected": -57.685691833496094, "loss": 1.6481, "rewards/accuracies": 0.0, "rewards/chosen": 2.5386626720428467, "rewards/margins": -0.7493150234222412, "rewards/rejected": 3.287977695465088, "step": 3387 }, { "epoch": 0.75, "learning_rate": 7.191998926948655e-06, "logits/chosen": -1.5359015464782715, "logits/rejected": -1.545906901359558, "logps/chosen": -77.29573822021484, "logps/rejected": -59.79296112060547, "loss": 2.1, "rewards/accuracies": 0.0, "rewards/chosen": 2.8559136390686035, "rewards/margins": -2.447004795074463, "rewards/rejected": 5.302918434143066, "step": 3388 }, { "epoch": 0.75, "learning_rate": 7.190387877381335e-06, "logits/chosen": -1.7441469430923462, "logits/rejected": -1.743830919265747, "logps/chosen": -106.62339782714844, "logps/rejected": -82.83213806152344, "loss": 2.2758, "rewards/accuracies": 0.0, "rewards/chosen": 5.175624370574951, "rewards/margins": -4.360009670257568, "rewards/rejected": 9.53563404083252, "step": 3389 }, { "epoch": 0.75, "learning_rate": 7.188776546354437e-06, "logits/chosen": -1.4374300241470337, "logits/rejected": -1.4138139486312866, "logps/chosen": -24.910594940185547, "logps/rejected": -23.300582885742188, "loss": 0.6436, "rewards/accuracies": 1.0, "rewards/chosen": 2.251945972442627, "rewards/margins": 0.20916748046875, "rewards/rejected": 2.042778491973877, "step": 3390 }, { "epoch": 0.75, "learning_rate": 7.1871649340750146e-06, "logits/chosen": -1.444591999053955, "logits/rejected": -1.4674614667892456, "logps/chosen": -48.188053131103516, "logps/rejected": -90.0555191040039, "loss": 2.3251, "rewards/accuracies": 0.0, "rewards/chosen": 4.284091472625732, "rewards/margins": -0.2610797882080078, "rewards/rejected": 4.54517126083374, "step": 3391 }, { "epoch": 0.75, "learning_rate": 7.185553040750156e-06, "logits/chosen": -1.613829255104065, "logits/rejected": -1.6561347246170044, "logps/chosen": -143.29251098632812, "logps/rejected": -122.57561492919922, "loss": 0.479, "rewards/accuracies": 0.0, "rewards/chosen": 5.535945415496826, "rewards/margins": -0.47242164611816406, "rewards/rejected": 6.00836706161499, "step": 3392 }, { "epoch": 0.75, "learning_rate": 7.183940866586986e-06, "logits/chosen": -1.5886242389678955, "logits/rejected": -1.5347509384155273, "logps/chosen": -91.75035095214844, "logps/rejected": -53.53818893432617, "loss": 0.8867, "rewards/accuracies": 1.0, "rewards/chosen": 4.89919900894165, "rewards/margins": 1.258284330368042, "rewards/rejected": 3.6409146785736084, "step": 3393 }, { "epoch": 0.75, "learning_rate": 7.182328411792664e-06, "logits/chosen": -1.6438509225845337, "logits/rejected": -1.525793433189392, "logps/chosen": -38.480804443359375, "logps/rejected": -44.661109924316406, "loss": 1.1758, "rewards/accuracies": 0.0, "rewards/chosen": 2.491014242172241, "rewards/margins": -0.653501033782959, "rewards/rejected": 3.1445152759552, "step": 3394 }, { "epoch": 0.75, "learning_rate": 7.180715676574387e-06, "logits/chosen": -1.3872027397155762, "logits/rejected": -1.3962432146072388, "logps/chosen": -50.15869903564453, "logps/rejected": -86.68325805664062, "loss": 2.557, "rewards/accuracies": 0.0, "rewards/chosen": 3.0681190490722656, "rewards/margins": -2.4386191368103027, "rewards/rejected": 5.506738185882568, "step": 3395 }, { "epoch": 0.75, "learning_rate": 7.17910266113939e-06, "logits/chosen": -1.9303045272827148, "logits/rejected": -1.9116369485855103, "logps/chosen": -125.33465576171875, "logps/rejected": -52.49464416503906, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 6.611447334289551, "rewards/margins": 5.450955390930176, "rewards/rejected": 1.160491943359375, "step": 3396 }, { "epoch": 0.75, "learning_rate": 7.177489365694936e-06, "logits/chosen": -1.566739797592163, "logits/rejected": -1.5204325914382935, "logps/chosen": -40.690521240234375, "logps/rejected": -27.89621353149414, "loss": 0.5874, "rewards/accuracies": 1.0, "rewards/chosen": 2.0508038997650146, "rewards/margins": 0.157950758934021, "rewards/rejected": 1.8928531408309937, "step": 3397 }, { "epoch": 0.75, "learning_rate": 7.175875790448335e-06, "logits/chosen": -1.6722484827041626, "logits/rejected": -1.588194489479065, "logps/chosen": -172.1226806640625, "logps/rejected": -65.48179626464844, "loss": 0.5096, "rewards/accuracies": 1.0, "rewards/chosen": 6.517356872558594, "rewards/margins": 1.0175366401672363, "rewards/rejected": 5.499820232391357, "step": 3398 }, { "epoch": 0.75, "learning_rate": 7.174261935606925e-06, "logits/chosen": -1.462870478630066, "logits/rejected": -1.3669935464859009, "logps/chosen": -63.74000549316406, "logps/rejected": -18.905868530273438, "loss": 2.6386, "rewards/accuracies": 1.0, "rewards/chosen": 3.7143173217773438, "rewards/margins": 3.3610024452209473, "rewards/rejected": 0.3533147871494293, "step": 3399 }, { "epoch": 0.75, "learning_rate": 7.172647801378086e-06, "logits/chosen": -1.7801419496536255, "logits/rejected": -1.6322782039642334, "logps/chosen": -118.67616271972656, "logps/rejected": -76.27076721191406, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 5.7431535720825195, "rewards/margins": 2.893432855606079, "rewards/rejected": 2.8497207164764404, "step": 3400 }, { "epoch": 0.75, "learning_rate": 7.171033387969224e-06, "logits/chosen": -1.4505397081375122, "logits/rejected": -1.4148638248443604, "logps/chosen": -102.588623046875, "logps/rejected": -51.08868408203125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 6.655453681945801, "rewards/margins": 4.115225791931152, "rewards/rejected": 2.5402276515960693, "step": 3401 }, { "epoch": 0.75, "learning_rate": 7.169418695587791e-06, "logits/chosen": -1.4829305410385132, "logits/rejected": -1.4829305410385132, "logps/chosen": -13.016969680786133, "logps/rejected": -13.016969680786133, "loss": 1.1786, "rewards/accuracies": 0.0, "rewards/chosen": 3.0078647136688232, "rewards/margins": 0.0, "rewards/rejected": 3.0078647136688232, "step": 3402 }, { "epoch": 0.75, "learning_rate": 7.167803724441271e-06, "logits/chosen": -1.621799111366272, "logits/rejected": -1.1390807628631592, "logps/chosen": -72.68179321289062, "logps/rejected": -91.43478393554688, "loss": 0.8697, "rewards/accuracies": 0.0, "rewards/chosen": 2.6626052856445312, "rewards/margins": -0.9618699550628662, "rewards/rejected": 3.6244752407073975, "step": 3403 }, { "epoch": 0.75, "learning_rate": 7.166188474737184e-06, "logits/chosen": -1.7536470890045166, "logits/rejected": -1.6952838897705078, "logps/chosen": -97.64384460449219, "logps/rejected": -71.71727752685547, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": 7.959123134613037, "rewards/margins": 2.003668785095215, "rewards/rejected": 5.955454349517822, "step": 3404 }, { "epoch": 0.75, "learning_rate": 7.164572946683086e-06, "logits/chosen": -1.6313741207122803, "logits/rejected": -1.5590083599090576, "logps/chosen": -86.9002685546875, "logps/rejected": -152.34225463867188, "loss": 1.9385, "rewards/accuracies": 0.0, "rewards/chosen": 5.469766139984131, "rewards/margins": -1.1485400199890137, "rewards/rejected": 6.6183061599731445, "step": 3405 }, { "epoch": 0.75, "learning_rate": 7.1629571404865686e-06, "logits/chosen": -1.3767096996307373, "logits/rejected": -1.3144484758377075, "logps/chosen": -80.61407470703125, "logps/rejected": -61.55149841308594, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": 5.6761980056762695, "rewards/margins": 2.7532289028167725, "rewards/rejected": 2.922969102859497, "step": 3406 }, { "epoch": 0.75, "learning_rate": 7.161341056355255e-06, "logits/chosen": -1.2188632488250732, "logits/rejected": -1.2520140409469604, "logps/chosen": -46.23059844970703, "logps/rejected": -42.285037994384766, "loss": 1.0849, "rewards/accuracies": 1.0, "rewards/chosen": 2.6542794704437256, "rewards/margins": 0.4316830635070801, "rewards/rejected": 2.2225964069366455, "step": 3407 }, { "epoch": 0.75, "learning_rate": 7.159724694496815e-06, "logits/chosen": -1.7274456024169922, "logits/rejected": -1.6464248895645142, "logps/chosen": -60.14435577392578, "logps/rejected": -32.54154586791992, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 6.005167484283447, "rewards/margins": 2.735847234725952, "rewards/rejected": 3.269320249557495, "step": 3408 }, { "epoch": 0.75, "learning_rate": 7.158108055118942e-06, "logits/chosen": -1.5100823640823364, "logits/rejected": -1.4575223922729492, "logps/chosen": -47.49268341064453, "logps/rejected": -31.237947463989258, "loss": 0.8489, "rewards/accuracies": 0.0, "rewards/chosen": 2.682039737701416, "rewards/margins": -1.1178641319274902, "rewards/rejected": 3.7999038696289062, "step": 3409 }, { "epoch": 0.75, "learning_rate": 7.156491138429371e-06, "logits/chosen": -1.56443190574646, "logits/rejected": -1.5841652154922485, "logps/chosen": -43.839420318603516, "logps/rejected": -43.0965461730957, "loss": 0.4167, "rewards/accuracies": 0.0, "rewards/chosen": 2.114407777786255, "rewards/margins": -0.22338247299194336, "rewards/rejected": 2.3377902507781982, "step": 3410 }, { "epoch": 0.75, "learning_rate": 7.154873944635874e-06, "logits/chosen": -1.690028190612793, "logits/rejected": -1.7372939586639404, "logps/chosen": -99.00422668457031, "logps/rejected": -131.78988647460938, "loss": 3.0056, "rewards/accuracies": 0.0, "rewards/chosen": 9.3803071975708, "rewards/margins": -2.405095100402832, "rewards/rejected": 11.785402297973633, "step": 3411 }, { "epoch": 0.76, "learning_rate": 7.153256473946254e-06, "logits/chosen": -1.7068899869918823, "logits/rejected": -1.4706904888153076, "logps/chosen": -101.7895278930664, "logps/rejected": -49.730995178222656, "loss": 0.4502, "rewards/accuracies": 1.0, "rewards/chosen": 4.347854137420654, "rewards/margins": 0.3532259464263916, "rewards/rejected": 3.9946281909942627, "step": 3412 }, { "epoch": 0.76, "learning_rate": 7.151638726568354e-06, "logits/chosen": -1.4096277952194214, "logits/rejected": -1.46901273727417, "logps/chosen": -76.721923828125, "logps/rejected": -105.25936126708984, "loss": 0.6567, "rewards/accuracies": 0.0, "rewards/chosen": 5.808480739593506, "rewards/margins": -0.9990243911743164, "rewards/rejected": 6.807505130767822, "step": 3413 }, { "epoch": 0.76, "learning_rate": 7.15002070271005e-06, "logits/chosen": -1.494202971458435, "logits/rejected": -1.494202971458435, "logps/chosen": -103.84506225585938, "logps/rejected": -103.84506225585938, "loss": 0.4422, "rewards/accuracies": 0.0, "rewards/chosen": 3.3397889137268066, "rewards/margins": 0.0, "rewards/rejected": 3.3397889137268066, "step": 3414 }, { "epoch": 0.76, "learning_rate": 7.1484024025792546e-06, "logits/chosen": -1.7402881383895874, "logits/rejected": -1.6453921794891357, "logps/chosen": -77.97267150878906, "logps/rejected": -113.41624450683594, "loss": 0.2158, "rewards/accuracies": 1.0, "rewards/chosen": 5.997586250305176, "rewards/margins": 1.8625140190124512, "rewards/rejected": 4.135072231292725, "step": 3415 }, { "epoch": 0.76, "learning_rate": 7.1467838263839155e-06, "logits/chosen": -1.7876936197280884, "logits/rejected": -1.7809085845947266, "logps/chosen": -71.6087417602539, "logps/rejected": -42.07981491088867, "loss": 0.5039, "rewards/accuracies": 1.0, "rewards/chosen": 2.6136481761932373, "rewards/margins": 0.19988727569580078, "rewards/rejected": 2.4137609004974365, "step": 3416 }, { "epoch": 0.76, "learning_rate": 7.145164974332015e-06, "logits/chosen": -1.3970732688903809, "logits/rejected": -1.3930186033248901, "logps/chosen": -21.098812103271484, "logps/rejected": -61.084190368652344, "loss": 0.7873, "rewards/accuracies": 1.0, "rewards/chosen": 2.551992893218994, "rewards/margins": 0.8071831464767456, "rewards/rejected": 1.7448097467422485, "step": 3417 }, { "epoch": 0.76, "learning_rate": 7.143545846631572e-06, "logits/chosen": -1.775152564048767, "logits/rejected": -1.693966269493103, "logps/chosen": -121.54867553710938, "logps/rejected": -48.06226348876953, "loss": 0.6194, "rewards/accuracies": 0.0, "rewards/chosen": 3.5778350830078125, "rewards/margins": -0.2828240394592285, "rewards/rejected": 3.860659122467041, "step": 3418 }, { "epoch": 0.76, "learning_rate": 7.141926443490641e-06, "logits/chosen": -1.5722987651824951, "logits/rejected": -1.4939043521881104, "logps/chosen": -52.6801643371582, "logps/rejected": -35.04275894165039, "loss": 1.6706, "rewards/accuracies": 0.0, "rewards/chosen": 3.363497495651245, "rewards/margins": -0.23338937759399414, "rewards/rejected": 3.5968868732452393, "step": 3419 }, { "epoch": 0.76, "learning_rate": 7.140306765117311e-06, "logits/chosen": -1.2815897464752197, "logits/rejected": -1.2495464086532593, "logps/chosen": -34.637672424316406, "logps/rejected": -74.10298919677734, "loss": 0.8601, "rewards/accuracies": 0.0, "rewards/chosen": 4.443929195404053, "rewards/margins": -1.2945818901062012, "rewards/rejected": 5.738511085510254, "step": 3420 }, { "epoch": 0.76, "learning_rate": 7.138686811719706e-06, "logits/chosen": -1.7024892568588257, "logits/rejected": -1.6374385356903076, "logps/chosen": -127.57096099853516, "logps/rejected": -81.4415283203125, "loss": 0.5711, "rewards/accuracies": 1.0, "rewards/chosen": 5.119104862213135, "rewards/margins": 0.49434423446655273, "rewards/rejected": 4.624760627746582, "step": 3421 }, { "epoch": 0.76, "learning_rate": 7.137066583505987e-06, "logits/chosen": -1.6079598665237427, "logits/rejected": -1.602980136871338, "logps/chosen": -37.678314208984375, "logps/rejected": -98.70033264160156, "loss": 1.3202, "rewards/accuracies": 0.0, "rewards/chosen": 3.2140884399414062, "rewards/margins": -0.32275390625, "rewards/rejected": 3.5368423461914062, "step": 3422 }, { "epoch": 0.76, "learning_rate": 7.13544608068435e-06, "logits/chosen": -2.0779542922973633, "logits/rejected": -2.0514180660247803, "logps/chosen": -78.88639831542969, "logps/rejected": -97.24954223632812, "loss": 1.6109, "rewards/accuracies": 1.0, "rewards/chosen": 8.000853538513184, "rewards/margins": 4.694690704345703, "rewards/rejected": 3.3061630725860596, "step": 3423 }, { "epoch": 0.76, "learning_rate": 7.133825303463026e-06, "logits/chosen": -1.7092221975326538, "logits/rejected": -1.6099026203155518, "logps/chosen": -102.96908569335938, "logps/rejected": -66.63766479492188, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 7.0687456130981445, "rewards/margins": 3.2522804737091064, "rewards/rejected": 3.816465139389038, "step": 3424 }, { "epoch": 0.76, "learning_rate": 7.132204252050279e-06, "logits/chosen": -1.7963978052139282, "logits/rejected": -1.8333008289337158, "logps/chosen": -93.07560729980469, "logps/rejected": -131.69297790527344, "loss": 1.4329, "rewards/accuracies": 0.0, "rewards/chosen": 4.305029392242432, "rewards/margins": -2.7783308029174805, "rewards/rejected": 7.083360195159912, "step": 3425 }, { "epoch": 0.76, "learning_rate": 7.13058292665441e-06, "logits/chosen": -1.4985226392745972, "logits/rejected": -1.4985226392745972, "logps/chosen": -101.90391540527344, "logps/rejected": -101.90391540527344, "loss": 0.5597, "rewards/accuracies": 0.0, "rewards/chosen": 7.942070007324219, "rewards/margins": 0.0, "rewards/rejected": 7.942070007324219, "step": 3426 }, { "epoch": 0.76, "learning_rate": 7.128961327483759e-06, "logits/chosen": -1.5488961935043335, "logits/rejected": -1.424867033958435, "logps/chosen": -107.17825317382812, "logps/rejected": -40.86066818237305, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 6.488965034484863, "rewards/margins": 6.086692810058594, "rewards/rejected": 0.40227243304252625, "step": 3427 }, { "epoch": 0.76, "learning_rate": 7.127339454746695e-06, "logits/chosen": -1.207993984222412, "logits/rejected": -1.0381572246551514, "logps/chosen": -65.29971313476562, "logps/rejected": -19.222354888916016, "loss": 0.1001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3034348487854004, "rewards/margins": 1.7554539442062378, "rewards/rejected": 0.5479809045791626, "step": 3428 }, { "epoch": 0.76, "learning_rate": 7.125717308651623e-06, "logits/chosen": -1.5577869415283203, "logits/rejected": -1.446492314338684, "logps/chosen": -75.10509490966797, "logps/rejected": -24.329883575439453, "loss": 0.3302, "rewards/accuracies": 1.0, "rewards/chosen": 3.5471420288085938, "rewards/margins": 1.4314167499542236, "rewards/rejected": 2.11572527885437, "step": 3429 }, { "epoch": 0.76, "learning_rate": 7.124094889406988e-06, "logits/chosen": -1.2934738397598267, "logits/rejected": -1.2498165369033813, "logps/chosen": -52.77979278564453, "logps/rejected": -80.92774963378906, "loss": 0.3298, "rewards/accuracies": 1.0, "rewards/chosen": 3.0914466381073, "rewards/margins": 1.2685065269470215, "rewards/rejected": 1.8229401111602783, "step": 3430 }, { "epoch": 0.76, "learning_rate": 7.122472197221266e-06, "logits/chosen": -1.6743805408477783, "logits/rejected": -1.6743805408477783, "logps/chosen": -39.52326202392578, "logps/rejected": -39.52326202392578, "loss": 0.4475, "rewards/accuracies": 0.0, "rewards/chosen": 3.370581865310669, "rewards/margins": 0.0, "rewards/rejected": 3.370581865310669, "step": 3431 }, { "epoch": 0.76, "learning_rate": 7.12084923230297e-06, "logits/chosen": -1.8385306596755981, "logits/rejected": -1.7745232582092285, "logps/chosen": -54.16437530517578, "logps/rejected": -41.972679138183594, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 4.1697893142700195, "rewards/margins": 2.699474811553955, "rewards/rejected": 1.470314383506775, "step": 3432 }, { "epoch": 0.76, "learning_rate": 7.119225994860646e-06, "logits/chosen": -1.486945390701294, "logits/rejected": -1.523357629776001, "logps/chosen": -46.44930648803711, "logps/rejected": -86.67402648925781, "loss": 1.8308, "rewards/accuracies": 0.0, "rewards/chosen": 2.4508023262023926, "rewards/margins": -3.3968677520751953, "rewards/rejected": 5.847670078277588, "step": 3433 }, { "epoch": 0.76, "learning_rate": 7.1176024851028745e-06, "logits/chosen": -1.6343469619750977, "logits/rejected": -1.7040975093841553, "logps/chosen": -58.17568588256836, "logps/rejected": -133.44717407226562, "loss": 2.6148, "rewards/accuracies": 0.0, "rewards/chosen": 2.9643466472625732, "rewards/margins": -4.714634895324707, "rewards/rejected": 7.678981304168701, "step": 3434 }, { "epoch": 0.76, "learning_rate": 7.115978703238275e-06, "logits/chosen": -1.2835463285446167, "logits/rejected": -1.171203374862671, "logps/chosen": -69.92362976074219, "logps/rejected": -74.8978042602539, "loss": 0.7406, "rewards/accuracies": 1.0, "rewards/chosen": 3.17303466796875, "rewards/margins": 2.5554680824279785, "rewards/rejected": 0.617566704750061, "step": 3435 }, { "epoch": 0.76, "learning_rate": 7.114354649475499e-06, "logits/chosen": -1.9930784702301025, "logits/rejected": -1.9728004932403564, "logps/chosen": -77.63145446777344, "logps/rejected": -62.48078536987305, "loss": 0.2417, "rewards/accuracies": 1.0, "rewards/chosen": 6.598487854003906, "rewards/margins": 3.0330898761749268, "rewards/rejected": 3.5653979778289795, "step": 3436 }, { "epoch": 0.76, "learning_rate": 7.112730324023234e-06, "logits/chosen": -1.7085872888565063, "logits/rejected": -1.5909086465835571, "logps/chosen": -97.34294128417969, "logps/rejected": -55.59014129638672, "loss": 0.9155, "rewards/accuracies": 1.0, "rewards/chosen": 7.884181499481201, "rewards/margins": 4.984929084777832, "rewards/rejected": 2.899252414703369, "step": 3437 }, { "epoch": 0.76, "learning_rate": 7.111105727090199e-06, "logits/chosen": -1.6677370071411133, "logits/rejected": -1.6399931907653809, "logps/chosen": -132.78387451171875, "logps/rejected": -138.13778686523438, "loss": 0.4435, "rewards/accuracies": 0.0, "rewards/chosen": 6.501628398895264, "rewards/margins": -0.13768434524536133, "rewards/rejected": 6.639312744140625, "step": 3438 }, { "epoch": 0.76, "learning_rate": 7.109480858885155e-06, "logits/chosen": -1.7423678636550903, "logits/rejected": -1.6872153282165527, "logps/chosen": -58.07971954345703, "logps/rejected": -93.43992614746094, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 3.0205185413360596, "rewards/margins": 0.4199485778808594, "rewards/rejected": 2.6005699634552, "step": 3439 }, { "epoch": 0.76, "learning_rate": 7.107855719616891e-06, "logits/chosen": -1.8091330528259277, "logits/rejected": -1.7347902059555054, "logps/chosen": -108.37310791015625, "logps/rejected": -50.77622985839844, "loss": 0.0922, "rewards/accuracies": 1.0, "rewards/chosen": 7.377905368804932, "rewards/margins": 1.7201933860778809, "rewards/rejected": 5.657711982727051, "step": 3440 }, { "epoch": 0.76, "learning_rate": 7.106230309494234e-06, "logits/chosen": -1.8654916286468506, "logits/rejected": -1.7870670557022095, "logps/chosen": -51.25833511352539, "logps/rejected": -22.076885223388672, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 3.263828754425049, "rewards/margins": 2.6420719623565674, "rewards/rejected": 0.6217567324638367, "step": 3441 }, { "epoch": 0.76, "learning_rate": 7.104604628726046e-06, "logits/chosen": -1.6312743425369263, "logits/rejected": -1.6312743425369263, "logps/chosen": -54.41541290283203, "logps/rejected": -54.41541290283203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.4838006496429443, "rewards/margins": 0.0, "rewards/rejected": 3.4838006496429443, "step": 3442 }, { "epoch": 0.76, "learning_rate": 7.102978677521223e-06, "logits/chosen": -1.6393334865570068, "logits/rejected": -1.5930397510528564, "logps/chosen": -56.66825866699219, "logps/rejected": -46.69340515136719, "loss": 1.0476, "rewards/accuracies": 0.0, "rewards/chosen": 1.8086708784103394, "rewards/margins": -0.30065906047821045, "rewards/rejected": 2.10932993888855, "step": 3443 }, { "epoch": 0.76, "learning_rate": 7.1013524560886935e-06, "logits/chosen": -1.3227416276931763, "logits/rejected": -1.138964056968689, "logps/chosen": -82.61318969726562, "logps/rejected": -22.974376678466797, "loss": 0.8004, "rewards/accuracies": 1.0, "rewards/chosen": 4.905592441558838, "rewards/margins": 4.731997489929199, "rewards/rejected": 0.17359505593776703, "step": 3444 }, { "epoch": 0.76, "learning_rate": 7.099725964637426e-06, "logits/chosen": -1.5017017126083374, "logits/rejected": -1.459434986114502, "logps/chosen": -55.14965057373047, "logps/rejected": -74.83291625976562, "loss": 0.2844, "rewards/accuracies": 1.0, "rewards/chosen": 2.887383222579956, "rewards/margins": 0.382110595703125, "rewards/rejected": 2.505272626876831, "step": 3445 }, { "epoch": 0.76, "learning_rate": 7.098099203376419e-06, "logits/chosen": -1.6382102966308594, "logits/rejected": -1.68038809299469, "logps/chosen": -45.42510986328125, "logps/rejected": -87.00009155273438, "loss": 2.1899, "rewards/accuracies": 0.0, "rewards/chosen": 3.0436203479766846, "rewards/margins": -2.599186658859253, "rewards/rejected": 5.6428070068359375, "step": 3446 }, { "epoch": 0.76, "learning_rate": 7.09647217251471e-06, "logits/chosen": -1.4544391632080078, "logits/rejected": -1.4544391632080078, "logps/chosen": -16.43389129638672, "logps/rejected": -16.43389129638672, "loss": 1.0893, "rewards/accuracies": 0.0, "rewards/chosen": 1.3738352060317993, "rewards/margins": 0.0, "rewards/rejected": 1.3738352060317993, "step": 3447 }, { "epoch": 0.76, "learning_rate": 7.094844872261366e-06, "logits/chosen": -1.7890732288360596, "logits/rejected": -1.740139365196228, "logps/chosen": -85.69020080566406, "logps/rejected": -67.2750244140625, "loss": 0.4791, "rewards/accuracies": 1.0, "rewards/chosen": 3.1741912364959717, "rewards/margins": 0.7653822898864746, "rewards/rejected": 2.408808946609497, "step": 3448 }, { "epoch": 0.76, "learning_rate": 7.09321730282549e-06, "logits/chosen": -1.603322148323059, "logits/rejected": -1.5711686611175537, "logps/chosen": -50.76936340332031, "logps/rejected": -16.950881958007812, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 2.7700562477111816, "rewards/margins": 2.519078254699707, "rewards/rejected": 0.2509780824184418, "step": 3449 }, { "epoch": 0.76, "learning_rate": 7.091589464416225e-06, "logits/chosen": -1.4356216192245483, "logits/rejected": -1.4161118268966675, "logps/chosen": -18.613677978515625, "logps/rejected": -5.43142032623291, "loss": 1.2845, "rewards/accuracies": 1.0, "rewards/chosen": 2.2953872680664062, "rewards/margins": 1.4557241201400757, "rewards/rejected": 0.8396631479263306, "step": 3450 }, { "epoch": 0.76, "learning_rate": 7.08996135724274e-06, "logits/chosen": -1.6677335500717163, "logits/rejected": -1.5734905004501343, "logps/chosen": -104.48251342773438, "logps/rejected": -71.38818359375, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 7.631036281585693, "rewards/margins": 3.0718588829040527, "rewards/rejected": 4.559177398681641, "step": 3451 }, { "epoch": 0.76, "learning_rate": 7.088332981514245e-06, "logits/chosen": -1.632152795791626, "logits/rejected": -1.5598535537719727, "logps/chosen": -108.09196472167969, "logps/rejected": -18.411203384399414, "loss": 0.8151, "rewards/accuracies": 1.0, "rewards/chosen": 3.142909288406372, "rewards/margins": 0.5651211738586426, "rewards/rejected": 2.5777881145477295, "step": 3452 }, { "epoch": 0.76, "learning_rate": 7.086704337439982e-06, "logits/chosen": -1.5256078243255615, "logits/rejected": -1.4856868982315063, "logps/chosen": -63.13279342651367, "logps/rejected": -43.87403869628906, "loss": 0.3817, "rewards/accuracies": 1.0, "rewards/chosen": 3.213031530380249, "rewards/margins": 0.015006065368652344, "rewards/rejected": 3.1980254650115967, "step": 3453 }, { "epoch": 0.76, "learning_rate": 7.0850754252292285e-06, "logits/chosen": -1.7771503925323486, "logits/rejected": -1.5948079824447632, "logps/chosen": -105.33207702636719, "logps/rejected": -25.6152400970459, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 9.125508308410645, "rewards/margins": 9.001938819885254, "rewards/rejected": 0.1235693022608757, "step": 3454 }, { "epoch": 0.76, "learning_rate": 7.0834462450912955e-06, "logits/chosen": -1.5161319971084595, "logits/rejected": -1.4672645330429077, "logps/chosen": -138.4361572265625, "logps/rejected": -124.93089294433594, "loss": 1.4572, "rewards/accuracies": 0.0, "rewards/chosen": 4.309121608734131, "rewards/margins": -1.877908706665039, "rewards/rejected": 6.18703031539917, "step": 3455 }, { "epoch": 0.76, "learning_rate": 7.081816797235528e-06, "logits/chosen": -1.622395396232605, "logits/rejected": -1.5705170631408691, "logps/chosen": -116.13447570800781, "logps/rejected": -202.91143798828125, "loss": 1.3722, "rewards/accuracies": 0.0, "rewards/chosen": 5.0901689529418945, "rewards/margins": -1.014085292816162, "rewards/rejected": 6.104254245758057, "step": 3456 }, { "epoch": 0.77, "learning_rate": 7.080187081871307e-06, "logits/chosen": -1.7595787048339844, "logits/rejected": -1.7901389598846436, "logps/chosen": -78.6016845703125, "logps/rejected": -125.35566711425781, "loss": 1.2092, "rewards/accuracies": 0.0, "rewards/chosen": 6.613517761230469, "rewards/margins": -2.149317741394043, "rewards/rejected": 8.762835502624512, "step": 3457 }, { "epoch": 0.77, "learning_rate": 7.0785570992080455e-06, "logits/chosen": -1.2729032039642334, "logits/rejected": -1.3013752698898315, "logps/chosen": -59.8676643371582, "logps/rejected": -56.46706008911133, "loss": 1.1429, "rewards/accuracies": 0.0, "rewards/chosen": 2.8862881660461426, "rewards/margins": -1.4430060386657715, "rewards/rejected": 4.329294204711914, "step": 3458 }, { "epoch": 0.77, "learning_rate": 7.076926849455196e-06, "logits/chosen": -1.5488866567611694, "logits/rejected": -1.5109658241271973, "logps/chosen": -105.63580322265625, "logps/rejected": -83.85071563720703, "loss": 0.5101, "rewards/accuracies": 0.0, "rewards/chosen": 3.5556182861328125, "rewards/margins": -0.5719232559204102, "rewards/rejected": 4.127541542053223, "step": 3459 }, { "epoch": 0.77, "learning_rate": 7.0752963328222366e-06, "logits/chosen": -1.8960168361663818, "logits/rejected": -1.8318798542022705, "logps/chosen": -34.237266540527344, "logps/rejected": -8.793570518493652, "loss": 0.0932, "rewards/accuracies": 1.0, "rewards/chosen": 3.1366326808929443, "rewards/margins": 2.5937509536743164, "rewards/rejected": 0.5428817868232727, "step": 3460 }, { "epoch": 0.77, "learning_rate": 7.073665549518688e-06, "logits/chosen": -1.7431395053863525, "logits/rejected": -1.675571322441101, "logps/chosen": -46.501304626464844, "logps/rejected": -6.028129577636719, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 2.7184815406799316, "rewards/margins": 2.092576026916504, "rewards/rejected": 0.6259055137634277, "step": 3461 }, { "epoch": 0.77, "learning_rate": 7.072034499754102e-06, "logits/chosen": -1.713326334953308, "logits/rejected": -1.7168118953704834, "logps/chosen": -72.08369445800781, "logps/rejected": -56.72156524658203, "loss": 0.1548, "rewards/accuracies": 1.0, "rewards/chosen": 2.7735321521759033, "rewards/margins": 1.2116035223007202, "rewards/rejected": 1.561928629875183, "step": 3462 }, { "epoch": 0.77, "learning_rate": 7.070403183738062e-06, "logits/chosen": -1.4761136770248413, "logits/rejected": -1.4780441522598267, "logps/chosen": -51.939002990722656, "logps/rejected": -26.104949951171875, "loss": 0.7069, "rewards/accuracies": 1.0, "rewards/chosen": 2.9975318908691406, "rewards/margins": 0.18056178092956543, "rewards/rejected": 2.816970109939575, "step": 3463 }, { "epoch": 0.77, "learning_rate": 7.068771601680191e-06, "logits/chosen": -1.3379323482513428, "logits/rejected": -1.2625480890274048, "logps/chosen": -27.10259437561035, "logps/rejected": -43.4000358581543, "loss": 0.2392, "rewards/accuracies": 1.0, "rewards/chosen": 2.4525582790374756, "rewards/margins": 0.5048432350158691, "rewards/rejected": 1.9477150440216064, "step": 3464 }, { "epoch": 0.77, "learning_rate": 7.067139753790142e-06, "logits/chosen": -1.2973140478134155, "logits/rejected": -1.2633682489395142, "logps/chosen": -35.77114486694336, "logps/rejected": -36.640045166015625, "loss": 0.3574, "rewards/accuracies": 0.0, "rewards/chosen": 2.219409704208374, "rewards/margins": -0.025631189346313477, "rewards/rejected": 2.2450408935546875, "step": 3465 }, { "epoch": 0.77, "learning_rate": 7.065507640277605e-06, "logits/chosen": -1.8133492469787598, "logits/rejected": -1.7783029079437256, "logps/chosen": -46.60459899902344, "logps/rejected": -12.281371116638184, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 3.62986159324646, "rewards/margins": 2.878709316253662, "rewards/rejected": 0.7511521577835083, "step": 3466 }, { "epoch": 0.77, "learning_rate": 7.063875261352301e-06, "logits/chosen": -1.529679775238037, "logits/rejected": -1.529679775238037, "logps/chosen": -14.073309898376465, "logps/rejected": -14.073309898376465, "loss": 0.5906, "rewards/accuracies": 0.0, "rewards/chosen": 0.7722250819206238, "rewards/margins": 0.0, "rewards/rejected": 0.7722250819206238, "step": 3467 }, { "epoch": 0.77, "learning_rate": 7.0622426172239875e-06, "logits/chosen": -1.4323760271072388, "logits/rejected": -1.5057826042175293, "logps/chosen": -168.82412719726562, "logps/rejected": -134.27569580078125, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": 7.795053005218506, "rewards/margins": 3.434988021850586, "rewards/rejected": 4.36006498336792, "step": 3468 }, { "epoch": 0.77, "learning_rate": 7.060609708102455e-06, "logits/chosen": -1.9153822660446167, "logits/rejected": -1.8964029550552368, "logps/chosen": -158.90289306640625, "logps/rejected": -126.86962127685547, "loss": 1.176, "rewards/accuracies": 0.0, "rewards/chosen": 6.593573093414307, "rewards/margins": -2.2401909828186035, "rewards/rejected": 8.83376407623291, "step": 3469 }, { "epoch": 0.77, "learning_rate": 7.058976534197528e-06, "logits/chosen": -1.7256957292556763, "logits/rejected": -1.728134036064148, "logps/chosen": -26.65362548828125, "logps/rejected": -16.89760398864746, "loss": 0.4922, "rewards/accuracies": 1.0, "rewards/chosen": 0.9215477108955383, "rewards/margins": 0.7371566891670227, "rewards/rejected": 0.18439102172851562, "step": 3470 }, { "epoch": 0.77, "learning_rate": 7.057343095719067e-06, "logits/chosen": -1.552555799484253, "logits/rejected": -1.3855706453323364, "logps/chosen": -109.94371032714844, "logps/rejected": -69.36953735351562, "loss": 0.135, "rewards/accuracies": 1.0, "rewards/chosen": 5.82052755355835, "rewards/margins": 1.2730751037597656, "rewards/rejected": 4.547452449798584, "step": 3471 }, { "epoch": 0.77, "learning_rate": 7.055709392876964e-06, "logits/chosen": -1.5100789070129395, "logits/rejected": -1.4574780464172363, "logps/chosen": -83.60137939453125, "logps/rejected": -53.817108154296875, "loss": 0.16, "rewards/accuracies": 1.0, "rewards/chosen": 4.260581970214844, "rewards/margins": 1.1869597434997559, "rewards/rejected": 3.073622226715088, "step": 3472 }, { "epoch": 0.77, "learning_rate": 7.054075425881144e-06, "logits/chosen": -1.6230300664901733, "logits/rejected": -1.5042836666107178, "logps/chosen": -97.124267578125, "logps/rejected": -45.60643005371094, "loss": 0.4669, "rewards/accuracies": 1.0, "rewards/chosen": 6.148614406585693, "rewards/margins": 3.7565977573394775, "rewards/rejected": 2.392016649246216, "step": 3473 }, { "epoch": 0.77, "learning_rate": 7.052441194941571e-06, "logits/chosen": -1.719506859779358, "logits/rejected": -1.6825695037841797, "logps/chosen": -41.20100784301758, "logps/rejected": -20.074302673339844, "loss": 0.5249, "rewards/accuracies": 0.0, "rewards/chosen": 2.1987760066986084, "rewards/margins": -0.3708765506744385, "rewards/rejected": 2.569652557373047, "step": 3474 }, { "epoch": 0.77, "learning_rate": 7.050806700268239e-06, "logits/chosen": -1.7401683330535889, "logits/rejected": -1.6988682746887207, "logps/chosen": -39.245670318603516, "logps/rejected": -53.62385559082031, "loss": 0.9239, "rewards/accuracies": 1.0, "rewards/chosen": 4.040987014770508, "rewards/margins": 1.6911029815673828, "rewards/rejected": 2.349884033203125, "step": 3475 }, { "epoch": 0.77, "learning_rate": 7.049171942071176e-06, "logits/chosen": -1.3865290880203247, "logits/rejected": -1.1929305791854858, "logps/chosen": -129.99737548828125, "logps/rejected": -72.78704071044922, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 6.4928131103515625, "rewards/margins": 3.050389051437378, "rewards/rejected": 3.4424240589141846, "step": 3476 }, { "epoch": 0.77, "learning_rate": 7.0475369205604435e-06, "logits/chosen": -1.4581220149993896, "logits/rejected": -1.4183741807937622, "logps/chosen": -90.1030502319336, "logps/rejected": -58.65174865722656, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 5.408731937408447, "rewards/margins": 1.3373284339904785, "rewards/rejected": 4.071403503417969, "step": 3477 }, { "epoch": 0.77, "learning_rate": 7.04590163594614e-06, "logits/chosen": -1.5320020914077759, "logits/rejected": -1.489234209060669, "logps/chosen": -34.4725227355957, "logps/rejected": -51.31586456298828, "loss": 0.476, "rewards/accuracies": 0.0, "rewards/chosen": 2.796847105026245, "rewards/margins": -0.3262059688568115, "rewards/rejected": 3.1230530738830566, "step": 3478 }, { "epoch": 0.77, "learning_rate": 7.044266088438393e-06, "logits/chosen": -1.4365845918655396, "logits/rejected": -1.409213900566101, "logps/chosen": -28.935089111328125, "logps/rejected": -35.30615997314453, "loss": 0.5343, "rewards/accuracies": 0.0, "rewards/chosen": 2.1535027027130127, "rewards/margins": -0.2980036735534668, "rewards/rejected": 2.4515063762664795, "step": 3479 }, { "epoch": 0.77, "learning_rate": 7.0426302782473676e-06, "logits/chosen": -1.5689655542373657, "logits/rejected": -1.441520094871521, "logps/chosen": -42.22381591796875, "logps/rejected": -21.868385314941406, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 3.2055556774139404, "rewards/margins": 1.7040679454803467, "rewards/rejected": 1.5014877319335938, "step": 3480 }, { "epoch": 0.77, "learning_rate": 7.040994205583263e-06, "logits/chosen": -1.797196626663208, "logits/rejected": -1.762326955795288, "logps/chosen": -52.601497650146484, "logps/rejected": -73.12277221679688, "loss": 1.1028, "rewards/accuracies": 0.0, "rewards/chosen": 1.9042164087295532, "rewards/margins": -0.44083893299102783, "rewards/rejected": 2.345055341720581, "step": 3481 }, { "epoch": 0.77, "learning_rate": 7.03935787065631e-06, "logits/chosen": -1.8506667613983154, "logits/rejected": -1.7881954908370972, "logps/chosen": -67.23484802246094, "logps/rejected": -59.90848159790039, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 2.6466004848480225, "rewards/margins": 0.6303441524505615, "rewards/rejected": 2.016256332397461, "step": 3482 }, { "epoch": 0.77, "learning_rate": 7.037721273676772e-06, "logits/chosen": -1.6021000146865845, "logits/rejected": -1.6049343347549438, "logps/chosen": -51.282100677490234, "logps/rejected": -78.34440612792969, "loss": 0.584, "rewards/accuracies": 0.0, "rewards/chosen": 3.639791488647461, "rewards/margins": -0.7919421195983887, "rewards/rejected": 4.43173360824585, "step": 3483 }, { "epoch": 0.77, "learning_rate": 7.036084414854949e-06, "logits/chosen": -1.6651617288589478, "logits/rejected": -1.6313579082489014, "logps/chosen": -90.77479553222656, "logps/rejected": -46.801780700683594, "loss": 0.5347, "rewards/accuracies": 0.0, "rewards/chosen": 3.6961076259613037, "rewards/margins": -0.5504324436187744, "rewards/rejected": 4.246540069580078, "step": 3484 }, { "epoch": 0.77, "learning_rate": 7.034447294401173e-06, "logits/chosen": -1.701216459274292, "logits/rejected": -1.676941990852356, "logps/chosen": -58.043113708496094, "logps/rejected": -85.72052764892578, "loss": 1.7278, "rewards/accuracies": 0.0, "rewards/chosen": 2.9220635890960693, "rewards/margins": -3.4045350551605225, "rewards/rejected": 6.326598644256592, "step": 3485 }, { "epoch": 0.77, "learning_rate": 7.032809912525811e-06, "logits/chosen": -1.6863529682159424, "logits/rejected": -1.6386224031448364, "logps/chosen": -107.84873962402344, "logps/rejected": -61.42200469970703, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 7.98745584487915, "rewards/margins": 5.610044479370117, "rewards/rejected": 2.3774116039276123, "step": 3486 }, { "epoch": 0.77, "learning_rate": 7.031172269439262e-06, "logits/chosen": -1.5159289836883545, "logits/rejected": -1.4980370998382568, "logps/chosen": -33.50599670410156, "logps/rejected": -36.08025360107422, "loss": 0.6572, "rewards/accuracies": 1.0, "rewards/chosen": 2.4814231395721436, "rewards/margins": 0.5971900224685669, "rewards/rejected": 1.8842331171035767, "step": 3487 }, { "epoch": 0.77, "learning_rate": 7.0295343653519585e-06, "logits/chosen": -1.4218641519546509, "logits/rejected": -1.3009952306747437, "logps/chosen": -24.00655174255371, "logps/rejected": -11.832462310791016, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 1.893733024597168, "rewards/margins": 0.8715239763259888, "rewards/rejected": 1.0222090482711792, "step": 3488 }, { "epoch": 0.77, "learning_rate": 7.027896200474369e-06, "logits/chosen": -1.4632786512374878, "logits/rejected": -1.388762354850769, "logps/chosen": -25.519739151000977, "logps/rejected": -9.014381408691406, "loss": 0.1427, "rewards/accuracies": 1.0, "rewards/chosen": 2.4497835636138916, "rewards/margins": 1.1699622869491577, "rewards/rejected": 1.2798212766647339, "step": 3489 }, { "epoch": 0.77, "learning_rate": 7.026257775016991e-06, "logits/chosen": -1.514499545097351, "logits/rejected": -1.514499545097351, "logps/chosen": -9.043085098266602, "logps/rejected": -9.043085098266602, "loss": 1.2624, "rewards/accuracies": 0.0, "rewards/chosen": 1.551527976989746, "rewards/margins": 0.0, "rewards/rejected": 1.551527976989746, "step": 3490 }, { "epoch": 0.77, "learning_rate": 7.024619089190361e-06, "logits/chosen": -1.6706750392913818, "logits/rejected": -1.687203288078308, "logps/chosen": -110.42025756835938, "logps/rejected": -96.0555648803711, "loss": 0.4751, "rewards/accuracies": 1.0, "rewards/chosen": 7.45454740524292, "rewards/margins": 1.5205621719360352, "rewards/rejected": 5.933985233306885, "step": 3491 }, { "epoch": 0.77, "learning_rate": 7.022980143205046e-06, "logits/chosen": -1.760565161705017, "logits/rejected": -1.760565161705017, "logps/chosen": -65.45458984375, "logps/rejected": -65.45458984375, "loss": 0.7391, "rewards/accuracies": 0.0, "rewards/chosen": 5.492687225341797, "rewards/margins": 0.0, "rewards/rejected": 5.492687225341797, "step": 3492 }, { "epoch": 0.77, "learning_rate": 7.021340937271645e-06, "logits/chosen": -1.6596344709396362, "logits/rejected": -1.533809781074524, "logps/chosen": -78.9852066040039, "logps/rejected": -53.8323974609375, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 7.476160526275635, "rewards/margins": 2.7343153953552246, "rewards/rejected": 4.74184513092041, "step": 3493 }, { "epoch": 0.77, "learning_rate": 7.019701471600792e-06, "logits/chosen": -1.9201821088790894, "logits/rejected": -1.8574893474578857, "logps/chosen": -91.90826416015625, "logps/rejected": -134.2202911376953, "loss": 1.2406, "rewards/accuracies": 0.0, "rewards/chosen": 6.259219646453857, "rewards/margins": -2.3518309593200684, "rewards/rejected": 8.611050605773926, "step": 3494 }, { "epoch": 0.77, "learning_rate": 7.018061746403155e-06, "logits/chosen": -1.4752238988876343, "logits/rejected": -1.3912955522537231, "logps/chosen": -22.499374389648438, "logps/rejected": -18.01593017578125, "loss": 0.4146, "rewards/accuracies": 1.0, "rewards/chosen": 3.2509663105010986, "rewards/margins": 1.859850287437439, "rewards/rejected": 1.3911160230636597, "step": 3495 }, { "epoch": 0.77, "learning_rate": 7.016421761889436e-06, "logits/chosen": -1.6208816766738892, "logits/rejected": -1.568771481513977, "logps/chosen": -33.760032653808594, "logps/rejected": -61.065582275390625, "loss": 0.404, "rewards/accuracies": 0.0, "rewards/chosen": 2.8814847469329834, "rewards/margins": -0.11517977714538574, "rewards/rejected": 2.996664524078369, "step": 3496 }, { "epoch": 0.77, "learning_rate": 7.014781518270367e-06, "logits/chosen": -1.7530508041381836, "logits/rejected": -1.7582793235778809, "logps/chosen": -125.88668060302734, "logps/rejected": -108.11221313476562, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 9.586353302001953, "rewards/margins": 3.8670096397399902, "rewards/rejected": 5.719343662261963, "step": 3497 }, { "epoch": 0.77, "learning_rate": 7.013141015756717e-06, "logits/chosen": -1.6458015441894531, "logits/rejected": -1.6368812322616577, "logps/chosen": -45.4224853515625, "logps/rejected": -80.25419616699219, "loss": 1.5665, "rewards/accuracies": 0.0, "rewards/chosen": 2.4823265075683594, "rewards/margins": -2.652808666229248, "rewards/rejected": 5.135135173797607, "step": 3498 }, { "epoch": 0.77, "learning_rate": 7.011500254559286e-06, "logits/chosen": -1.5528110265731812, "logits/rejected": -1.5104551315307617, "logps/chosen": -88.79444122314453, "logps/rejected": -82.3818359375, "loss": 1.3748, "rewards/accuracies": 1.0, "rewards/chosen": 5.9564032554626465, "rewards/margins": 1.692988395690918, "rewards/rejected": 4.2634148597717285, "step": 3499 }, { "epoch": 0.77, "learning_rate": 7.0098592348889074e-06, "logits/chosen": -1.8539822101593018, "logits/rejected": -1.8447003364562988, "logps/chosen": -174.09408569335938, "logps/rejected": -73.07125091552734, "loss": 0.4049, "rewards/accuracies": 1.0, "rewards/chosen": 7.463081359863281, "rewards/margins": 2.768430233001709, "rewards/rejected": 4.694651126861572, "step": 3500 }, { "epoch": 0.77, "learning_rate": 7.008217956956449e-06, "logits/chosen": -1.6921663284301758, "logits/rejected": -1.6921663284301758, "logps/chosen": -51.32286071777344, "logps/rejected": -51.32286071777344, "loss": 0.7458, "rewards/accuracies": 0.0, "rewards/chosen": 2.148815870285034, "rewards/margins": 0.0, "rewards/rejected": 2.148815870285034, "step": 3501 }, { "epoch": 0.78, "learning_rate": 7.00657642097281e-06, "logits/chosen": -1.7229620218276978, "logits/rejected": -1.6790512800216675, "logps/chosen": -87.98664855957031, "logps/rejected": -108.12123107910156, "loss": 0.1695, "rewards/accuracies": 1.0, "rewards/chosen": 4.325778484344482, "rewards/margins": 1.2554337978363037, "rewards/rejected": 3.0703446865081787, "step": 3502 }, { "epoch": 0.78, "learning_rate": 7.004934627148925e-06, "logits/chosen": -1.5078067779541016, "logits/rejected": -1.5375056266784668, "logps/chosen": -106.74786376953125, "logps/rejected": -72.42617797851562, "loss": 1.717, "rewards/accuracies": 0.0, "rewards/chosen": 3.9320297241210938, "rewards/margins": -2.422490119934082, "rewards/rejected": 6.354519844055176, "step": 3503 }, { "epoch": 0.78, "learning_rate": 7.003292575695761e-06, "logits/chosen": -1.6287857294082642, "logits/rejected": -1.5992257595062256, "logps/chosen": -68.96847534179688, "logps/rejected": -74.91117095947266, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 7.330081462860107, "rewards/margins": 4.106799125671387, "rewards/rejected": 3.2232825756073, "step": 3504 }, { "epoch": 0.78, "learning_rate": 7.001650266824315e-06, "logits/chosen": -1.6455625295639038, "logits/rejected": -1.5223438739776611, "logps/chosen": -93.62808227539062, "logps/rejected": -54.66264343261719, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": 5.62387228012085, "rewards/margins": 2.812854528427124, "rewards/rejected": 2.8110177516937256, "step": 3505 }, { "epoch": 0.78, "learning_rate": 7.000007700745622e-06, "logits/chosen": -1.8413012027740479, "logits/rejected": -1.803337574005127, "logps/chosen": -51.33417510986328, "logps/rejected": -67.58892822265625, "loss": 2.3386, "rewards/accuracies": 1.0, "rewards/chosen": 2.819629669189453, "rewards/margins": 0.054329633712768555, "rewards/rejected": 2.7653000354766846, "step": 3506 }, { "epoch": 0.78, "learning_rate": 6.998364877670748e-06, "logits/chosen": -1.6152235269546509, "logits/rejected": -1.5615590810775757, "logps/chosen": -67.09858703613281, "logps/rejected": -44.21566390991211, "loss": 1.1116, "rewards/accuracies": 0.0, "rewards/chosen": 2.1051881313323975, "rewards/margins": -1.2446222305297852, "rewards/rejected": 3.3498103618621826, "step": 3507 }, { "epoch": 0.78, "learning_rate": 6.996721797810791e-06, "logits/chosen": -1.672907829284668, "logits/rejected": -1.7350014448165894, "logps/chosen": -68.88481140136719, "logps/rejected": -66.95512390136719, "loss": 1.5722, "rewards/accuracies": 0.0, "rewards/chosen": 4.088134765625, "rewards/margins": -3.060978889465332, "rewards/rejected": 7.149113655090332, "step": 3508 }, { "epoch": 0.78, "learning_rate": 6.9950784613768855e-06, "logits/chosen": -1.5508989095687866, "logits/rejected": -1.5095512866973877, "logps/chosen": -82.6518325805664, "logps/rejected": -37.59610366821289, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 6.874210357666016, "rewards/margins": 3.6854770183563232, "rewards/rejected": 3.1887333393096924, "step": 3509 }, { "epoch": 0.78, "learning_rate": 6.99343486858019e-06, "logits/chosen": -1.544751524925232, "logits/rejected": -1.459375023841858, "logps/chosen": -90.06414794921875, "logps/rejected": -69.97911071777344, "loss": 0.4984, "rewards/accuracies": 1.0, "rewards/chosen": 6.788055419921875, "rewards/margins": 3.4587080478668213, "rewards/rejected": 3.3293473720550537, "step": 3510 }, { "epoch": 0.78, "learning_rate": 6.991791019631907e-06, "logits/chosen": -1.4699290990829468, "logits/rejected": -1.34355628490448, "logps/chosen": -60.060333251953125, "logps/rejected": -25.407657623291016, "loss": 1.517, "rewards/accuracies": 1.0, "rewards/chosen": 1.5956214666366577, "rewards/margins": 0.7271403670310974, "rewards/rejected": 0.8684810996055603, "step": 3511 }, { "epoch": 0.78, "learning_rate": 6.990146914743266e-06, "logits/chosen": -1.4320544004440308, "logits/rejected": -1.559907078742981, "logps/chosen": -31.13165855407715, "logps/rejected": -95.22039031982422, "loss": 4.0931, "rewards/accuracies": 0.0, "rewards/chosen": 2.253563642501831, "rewards/margins": -7.929408073425293, "rewards/rejected": 10.182971954345703, "step": 3512 }, { "epoch": 0.78, "learning_rate": 6.988502554125531e-06, "logits/chosen": -1.7968751192092896, "logits/rejected": -1.7166141271591187, "logps/chosen": -112.3840560913086, "logps/rejected": -48.295799255371094, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": 5.682750701904297, "rewards/margins": 2.937725067138672, "rewards/rejected": 2.745025634765625, "step": 3513 }, { "epoch": 0.78, "learning_rate": 6.986857937989998e-06, "logits/chosen": -1.5447032451629639, "logits/rejected": -1.4878134727478027, "logps/chosen": -57.87260437011719, "logps/rejected": -49.017513275146484, "loss": 0.6427, "rewards/accuracies": 0.0, "rewards/chosen": 5.0992937088012695, "rewards/margins": -0.5762147903442383, "rewards/rejected": 5.675508499145508, "step": 3514 }, { "epoch": 0.78, "learning_rate": 6.985213066547996e-06, "logits/chosen": -2.014275312423706, "logits/rejected": -2.030181407928467, "logps/chosen": -75.40473937988281, "logps/rejected": -80.09259796142578, "loss": 3.3579, "rewards/accuracies": 0.0, "rewards/chosen": 3.548856496810913, "rewards/margins": -3.624157667160034, "rewards/rejected": 7.173014163970947, "step": 3515 }, { "epoch": 0.78, "learning_rate": 6.983567940010889e-06, "logits/chosen": -1.5153279304504395, "logits/rejected": -1.3784072399139404, "logps/chosen": -49.22854232788086, "logps/rejected": -37.565574645996094, "loss": 1.4361, "rewards/accuracies": 0.0, "rewards/chosen": 1.2517318725585938, "rewards/margins": -0.6596370935440063, "rewards/rejected": 1.9113689661026, "step": 3516 }, { "epoch": 0.78, "learning_rate": 6.9819225585900685e-06, "logits/chosen": -1.3547035455703735, "logits/rejected": -1.438042163848877, "logps/chosen": -39.90324401855469, "logps/rejected": -100.53181457519531, "loss": 3.4479, "rewards/accuracies": 0.0, "rewards/chosen": 2.2530593872070312, "rewards/margins": -5.220941066741943, "rewards/rejected": 7.474000453948975, "step": 3517 }, { "epoch": 0.78, "learning_rate": 6.980276922496963e-06, "logits/chosen": -1.954518437385559, "logits/rejected": -1.8620747327804565, "logps/chosen": -65.95020294189453, "logps/rejected": -39.4620361328125, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 6.9832024574279785, "rewards/margins": 4.55102014541626, "rewards/rejected": 2.4321823120117188, "step": 3518 }, { "epoch": 0.78, "learning_rate": 6.978631031943035e-06, "logits/chosen": -1.4878236055374146, "logits/rejected": -1.6278996467590332, "logps/chosen": -46.49040222167969, "logps/rejected": -64.19328308105469, "loss": 2.8101, "rewards/accuracies": 0.0, "rewards/chosen": 2.5880844593048096, "rewards/margins": -5.436546325683594, "rewards/rejected": 8.024630546569824, "step": 3519 }, { "epoch": 0.78, "learning_rate": 6.976984887139775e-06, "logits/chosen": -1.7535817623138428, "logits/rejected": -1.734861969947815, "logps/chosen": -37.81359100341797, "logps/rejected": -74.74269104003906, "loss": 0.5931, "rewards/accuracies": 0.0, "rewards/chosen": 3.248772382736206, "rewards/margins": -0.11241388320922852, "rewards/rejected": 3.3611862659454346, "step": 3520 }, { "epoch": 0.78, "learning_rate": 6.9753384882987085e-06, "logits/chosen": -1.38603937625885, "logits/rejected": -1.350374698638916, "logps/chosen": -65.45000457763672, "logps/rejected": -57.580589294433594, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 1.759102702140808, "rewards/margins": 0.1616363525390625, "rewards/rejected": 1.5974663496017456, "step": 3521 }, { "epoch": 0.78, "learning_rate": 6.973691835631396e-06, "logits/chosen": -1.4665353298187256, "logits/rejected": -1.4897652864456177, "logps/chosen": -62.964393615722656, "logps/rejected": -55.39999008178711, "loss": 1.2933, "rewards/accuracies": 0.0, "rewards/chosen": 1.7606598138809204, "rewards/margins": -1.4069567918777466, "rewards/rejected": 3.167616605758667, "step": 3522 }, { "epoch": 0.78, "learning_rate": 6.9720449293494275e-06, "logits/chosen": -1.6593540906906128, "logits/rejected": -1.6025941371917725, "logps/chosen": -62.79410171508789, "logps/rejected": -53.89532470703125, "loss": 0.2417, "rewards/accuracies": 1.0, "rewards/chosen": 3.6619656085968018, "rewards/margins": 0.7961254119873047, "rewards/rejected": 2.865840196609497, "step": 3523 }, { "epoch": 0.78, "learning_rate": 6.970397769664425e-06, "logits/chosen": -1.776602864265442, "logits/rejected": -1.7349077463150024, "logps/chosen": -117.17237091064453, "logps/rejected": -47.47873306274414, "loss": 0.9623, "rewards/accuracies": 1.0, "rewards/chosen": 6.117282867431641, "rewards/margins": 2.6221232414245605, "rewards/rejected": 3.49515962600708, "step": 3524 }, { "epoch": 0.78, "learning_rate": 6.968750356788047e-06, "logits/chosen": -1.6172741651535034, "logits/rejected": -1.6706054210662842, "logps/chosen": -91.14442443847656, "logps/rejected": -94.29605102539062, "loss": 2.1928, "rewards/accuracies": 0.0, "rewards/chosen": 5.103662014007568, "rewards/margins": -3.8916172981262207, "rewards/rejected": 8.995279312133789, "step": 3525 }, { "epoch": 0.78, "learning_rate": 6.967102690931982e-06, "logits/chosen": -1.6987402439117432, "logits/rejected": -1.6517033576965332, "logps/chosen": -184.02584838867188, "logps/rejected": -95.63449096679688, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": 9.06054973602295, "rewards/margins": 2.589449882507324, "rewards/rejected": 6.471099853515625, "step": 3526 }, { "epoch": 0.78, "learning_rate": 6.965454772307948e-06, "logits/chosen": -1.5931934118270874, "logits/rejected": -1.5610207319259644, "logps/chosen": -61.23589324951172, "logps/rejected": -87.52520751953125, "loss": 1.4165, "rewards/accuracies": 0.0, "rewards/chosen": 2.4354248046875, "rewards/margins": -1.8895492553710938, "rewards/rejected": 4.324974060058594, "step": 3527 }, { "epoch": 0.78, "learning_rate": 6.963806601127702e-06, "logits/chosen": -1.1355630159378052, "logits/rejected": -1.1058452129364014, "logps/chosen": -35.183265686035156, "logps/rejected": -60.379730224609375, "loss": 1.9432, "rewards/accuracies": 1.0, "rewards/chosen": 4.168526649475098, "rewards/margins": 1.175657033920288, "rewards/rejected": 2.9928696155548096, "step": 3528 }, { "epoch": 0.78, "learning_rate": 6.962158177603027e-06, "logits/chosen": -1.6948922872543335, "logits/rejected": -1.6948922872543335, "logps/chosen": -35.02375793457031, "logps/rejected": -35.02375793457031, "loss": 0.3584, "rewards/accuracies": 0.0, "rewards/chosen": 4.310553073883057, "rewards/margins": 0.0, "rewards/rejected": 4.310553073883057, "step": 3529 }, { "epoch": 0.78, "learning_rate": 6.960509501945744e-06, "logits/chosen": -1.574090600013733, "logits/rejected": -1.574090600013733, "logps/chosen": -33.51548767089844, "logps/rejected": -33.51548767089844, "loss": 0.9131, "rewards/accuracies": 0.0, "rewards/chosen": 4.095900058746338, "rewards/margins": 0.0, "rewards/rejected": 4.095900058746338, "step": 3530 }, { "epoch": 0.78, "learning_rate": 6.958860574367703e-06, "logits/chosen": -1.5799192190170288, "logits/rejected": -1.5799192190170288, "logps/chosen": -50.72866439819336, "logps/rejected": -50.72866439819336, "loss": 1.7375, "rewards/accuracies": 0.0, "rewards/chosen": 5.165411949157715, "rewards/margins": 0.0, "rewards/rejected": 5.165411949157715, "step": 3531 }, { "epoch": 0.78, "learning_rate": 6.957211395080787e-06, "logits/chosen": -1.6070032119750977, "logits/rejected": -1.6070032119750977, "logps/chosen": -70.79489135742188, "logps/rejected": -70.79489135742188, "loss": 0.522, "rewards/accuracies": 0.0, "rewards/chosen": 6.377685546875, "rewards/margins": 0.0, "rewards/rejected": 6.377685546875, "step": 3532 }, { "epoch": 0.78, "learning_rate": 6.955561964296911e-06, "logits/chosen": -1.6217604875564575, "logits/rejected": -1.5274736881256104, "logps/chosen": -71.54605865478516, "logps/rejected": -120.7469482421875, "loss": 1.0587, "rewards/accuracies": 0.0, "rewards/chosen": 6.854708194732666, "rewards/margins": -1.0739569664001465, "rewards/rejected": 7.9286651611328125, "step": 3533 }, { "epoch": 0.78, "learning_rate": 6.9539122822280246e-06, "logits/chosen": -1.5244907140731812, "logits/rejected": -1.5147813558578491, "logps/chosen": -83.40241241455078, "logps/rejected": -41.20210647583008, "loss": 0.2629, "rewards/accuracies": 1.0, "rewards/chosen": 2.9929306507110596, "rewards/margins": 0.9482097625732422, "rewards/rejected": 2.0447208881378174, "step": 3534 }, { "epoch": 0.78, "learning_rate": 6.952262349086108e-06, "logits/chosen": -1.5573577880859375, "logits/rejected": -1.4228005409240723, "logps/chosen": -57.22901153564453, "logps/rejected": -65.64313507080078, "loss": 0.8569, "rewards/accuracies": 1.0, "rewards/chosen": 5.788609504699707, "rewards/margins": 0.6191534996032715, "rewards/rejected": 5.1694560050964355, "step": 3535 }, { "epoch": 0.78, "learning_rate": 6.95061216508317e-06, "logits/chosen": -1.453398585319519, "logits/rejected": -1.453490972518921, "logps/chosen": -60.39485168457031, "logps/rejected": -53.35127639770508, "loss": 1.9218, "rewards/accuracies": 0.0, "rewards/chosen": 2.9459869861602783, "rewards/margins": -0.6693553924560547, "rewards/rejected": 3.615342378616333, "step": 3536 }, { "epoch": 0.78, "learning_rate": 6.94896173043126e-06, "logits/chosen": -1.8793001174926758, "logits/rejected": -1.836976170539856, "logps/chosen": -84.02873229980469, "logps/rejected": -60.691741943359375, "loss": 0.8437, "rewards/accuracies": 0.0, "rewards/chosen": 5.299681186676025, "rewards/margins": -0.797609806060791, "rewards/rejected": 6.097290992736816, "step": 3537 }, { "epoch": 0.78, "learning_rate": 6.947311045342451e-06, "logits/chosen": -1.8113231658935547, "logits/rejected": -1.8712128400802612, "logps/chosen": -108.62062072753906, "logps/rejected": -146.4214324951172, "loss": 2.6988, "rewards/accuracies": 0.0, "rewards/chosen": 2.1205062866210938, "rewards/margins": -5.381721496582031, "rewards/rejected": 7.502227783203125, "step": 3538 }, { "epoch": 0.78, "learning_rate": 6.945660110028856e-06, "logits/chosen": -1.7804821729660034, "logits/rejected": -1.8180842399597168, "logps/chosen": -81.1287841796875, "logps/rejected": -71.20091247558594, "loss": 1.8483, "rewards/accuracies": 0.0, "rewards/chosen": 2.9947030544281006, "rewards/margins": -2.952512502670288, "rewards/rejected": 5.947215557098389, "step": 3539 }, { "epoch": 0.78, "learning_rate": 6.9440089247026135e-06, "logits/chosen": -1.0472723245620728, "logits/rejected": -1.0293967723846436, "logps/chosen": -6.034990310668945, "logps/rejected": -8.69862174987793, "loss": 0.5799, "rewards/accuracies": 0.0, "rewards/chosen": 0.6058891415596008, "rewards/margins": -0.46914583444595337, "rewards/rejected": 1.0750349760055542, "step": 3540 }, { "epoch": 0.78, "learning_rate": 6.942357489575896e-06, "logits/chosen": -1.6452467441558838, "logits/rejected": -1.6506308317184448, "logps/chosen": -129.33738708496094, "logps/rejected": -62.82537841796875, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": 5.799699306488037, "rewards/margins": 3.99643611907959, "rewards/rejected": 1.8032630681991577, "step": 3541 }, { "epoch": 0.78, "learning_rate": 6.940705804860912e-06, "logits/chosen": -1.9457204341888428, "logits/rejected": -1.8905456066131592, "logps/chosen": -72.74649810791016, "logps/rejected": -53.98703384399414, "loss": 0.8275, "rewards/accuracies": 1.0, "rewards/chosen": 6.453897953033447, "rewards/margins": 2.7066707611083984, "rewards/rejected": 3.747227191925049, "step": 3542 }, { "epoch": 0.78, "learning_rate": 6.939053870769897e-06, "logits/chosen": -1.7193025350570679, "logits/rejected": -1.5859122276306152, "logps/chosen": -50.58108901977539, "logps/rejected": -15.416123390197754, "loss": 0.325, "rewards/accuracies": 1.0, "rewards/chosen": 4.889158248901367, "rewards/margins": 4.17781925201416, "rewards/rejected": 0.7113391160964966, "step": 3543 }, { "epoch": 0.78, "learning_rate": 6.93740168751512e-06, "logits/chosen": -1.4555691480636597, "logits/rejected": -1.4396344423294067, "logps/chosen": -67.7056884765625, "logps/rejected": -89.71353912353516, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": 7.219333171844482, "rewards/margins": 1.8170967102050781, "rewards/rejected": 5.402236461639404, "step": 3544 }, { "epoch": 0.78, "learning_rate": 6.935749255308885e-06, "logits/chosen": -1.6382218599319458, "logits/rejected": -1.548032522201538, "logps/chosen": -33.5057373046875, "logps/rejected": -35.50654983520508, "loss": 0.8208, "rewards/accuracies": 1.0, "rewards/chosen": 2.780458927154541, "rewards/margins": 2.0971386432647705, "rewards/rejected": 0.6833202242851257, "step": 3545 }, { "epoch": 0.78, "learning_rate": 6.9340965743635236e-06, "logits/chosen": -1.5917521715164185, "logits/rejected": -1.5158947706222534, "logps/chosen": -77.58341979980469, "logps/rejected": -56.64360046386719, "loss": 0.3541, "rewards/accuracies": 1.0, "rewards/chosen": 3.063674211502075, "rewards/margins": 1.1861854791641235, "rewards/rejected": 1.8774887323379517, "step": 3546 }, { "epoch": 0.79, "learning_rate": 6.932443644891402e-06, "logits/chosen": -1.582032561302185, "logits/rejected": -1.6334573030471802, "logps/chosen": -139.25164794921875, "logps/rejected": -93.23220825195312, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 6.1272125244140625, "rewards/margins": 3.0987319946289062, "rewards/rejected": 3.0284805297851562, "step": 3547 }, { "epoch": 0.79, "learning_rate": 6.930790467104916e-06, "logits/chosen": -1.475692629814148, "logits/rejected": -1.431187391281128, "logps/chosen": -90.95357513427734, "logps/rejected": -192.3856201171875, "loss": 1.0923, "rewards/accuracies": 0.0, "rewards/chosen": 7.006139278411865, "rewards/margins": -1.5498576164245605, "rewards/rejected": 8.555996894836426, "step": 3548 }, { "epoch": 0.79, "learning_rate": 6.9291370412164985e-06, "logits/chosen": -1.7087243795394897, "logits/rejected": -1.7273484468460083, "logps/chosen": -47.173770904541016, "logps/rejected": -41.91918182373047, "loss": 0.25, "rewards/accuracies": 1.0, "rewards/chosen": 2.9711415767669678, "rewards/margins": 0.5727834701538086, "rewards/rejected": 2.398358106613159, "step": 3549 }, { "epoch": 0.79, "learning_rate": 6.927483367438608e-06, "logits/chosen": -1.4788525104522705, "logits/rejected": -1.4405391216278076, "logps/chosen": -16.15131378173828, "logps/rejected": -18.34062957763672, "loss": 0.6682, "rewards/accuracies": 0.0, "rewards/chosen": 1.5736020803451538, "rewards/margins": -0.8009737730026245, "rewards/rejected": 2.3745758533477783, "step": 3550 }, { "epoch": 0.79, "learning_rate": 6.925829445983738e-06, "logits/chosen": -1.7090144157409668, "logits/rejected": -1.6953027248382568, "logps/chosen": -35.8392333984375, "logps/rejected": -38.80596923828125, "loss": 1.2904, "rewards/accuracies": 0.0, "rewards/chosen": 2.975569248199463, "rewards/margins": -2.446488380432129, "rewards/rejected": 5.422057628631592, "step": 3551 }, { "epoch": 0.79, "learning_rate": 6.924175277064414e-06, "logits/chosen": -1.4417471885681152, "logits/rejected": -1.3867230415344238, "logps/chosen": -53.90314483642578, "logps/rejected": -64.7536392211914, "loss": 0.4996, "rewards/accuracies": 0.0, "rewards/chosen": 2.6855247020721436, "rewards/margins": -0.0028488636016845703, "rewards/rejected": 2.688373565673828, "step": 3552 }, { "epoch": 0.79, "learning_rate": 6.922520860893193e-06, "logits/chosen": -1.7298088073730469, "logits/rejected": -1.6869547367095947, "logps/chosen": -78.92518615722656, "logps/rejected": -79.07731628417969, "loss": 1.5692, "rewards/accuracies": 0.0, "rewards/chosen": 3.174046277999878, "rewards/margins": -1.9514377117156982, "rewards/rejected": 5.125483989715576, "step": 3553 }, { "epoch": 0.79, "learning_rate": 6.920866197682662e-06, "logits/chosen": -1.7264090776443481, "logits/rejected": -1.7476102113723755, "logps/chosen": -143.07723999023438, "logps/rejected": -191.27735900878906, "loss": 0.8973, "rewards/accuracies": 0.0, "rewards/chosen": 7.349494934082031, "rewards/margins": -1.6047391891479492, "rewards/rejected": 8.95423412322998, "step": 3554 }, { "epoch": 0.79, "learning_rate": 6.919211287645442e-06, "logits/chosen": -1.645716905593872, "logits/rejected": -1.4739303588867188, "logps/chosen": -60.73570251464844, "logps/rejected": -22.006179809570312, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 4.640662670135498, "rewards/margins": 3.98982834815979, "rewards/rejected": 0.6508342623710632, "step": 3555 }, { "epoch": 0.79, "learning_rate": 6.917556130994187e-06, "logits/chosen": -1.781413197517395, "logits/rejected": -1.78561270236969, "logps/chosen": -19.079294204711914, "logps/rejected": -45.470863342285156, "loss": 2.7048, "rewards/accuracies": 0.0, "rewards/chosen": 1.3047910928726196, "rewards/margins": -2.2962069511413574, "rewards/rejected": 3.6009979248046875, "step": 3556 }, { "epoch": 0.79, "learning_rate": 6.915900727941577e-06, "logits/chosen": -1.6917996406555176, "logits/rejected": -1.6557466983795166, "logps/chosen": -75.73683166503906, "logps/rejected": -42.263877868652344, "loss": 2.7055, "rewards/accuracies": 0.0, "rewards/chosen": 1.5147552490234375, "rewards/margins": -1.071868896484375, "rewards/rejected": 2.5866241455078125, "step": 3557 }, { "epoch": 0.79, "learning_rate": 6.91424507870033e-06, "logits/chosen": -1.68305504322052, "logits/rejected": -1.6679962873458862, "logps/chosen": -118.10337829589844, "logps/rejected": -68.89817810058594, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 8.773323059082031, "rewards/margins": 3.3901638984680176, "rewards/rejected": 5.383159160614014, "step": 3558 }, { "epoch": 0.79, "learning_rate": 6.9125891834831916e-06, "logits/chosen": -1.701095700263977, "logits/rejected": -1.5772669315338135, "logps/chosen": -136.114501953125, "logps/rejected": -102.29347229003906, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 7.463803291320801, "rewards/margins": 4.631669998168945, "rewards/rejected": 2.8321335315704346, "step": 3559 }, { "epoch": 0.79, "learning_rate": 6.910933042502943e-06, "logits/chosen": -1.657832384109497, "logits/rejected": -1.7110718488693237, "logps/chosen": -38.500709533691406, "logps/rejected": -55.90618896484375, "loss": 2.7115, "rewards/accuracies": 0.0, "rewards/chosen": 3.6661460399627686, "rewards/margins": -0.7151925563812256, "rewards/rejected": 4.381338596343994, "step": 3560 }, { "epoch": 0.79, "learning_rate": 6.90927665597239e-06, "logits/chosen": -1.245309829711914, "logits/rejected": -1.1418774127960205, "logps/chosen": -26.774234771728516, "logps/rejected": -13.542387008666992, "loss": 0.3359, "rewards/accuracies": 1.0, "rewards/chosen": 2.4979000091552734, "rewards/margins": 0.48097825050354004, "rewards/rejected": 2.0169217586517334, "step": 3561 }, { "epoch": 0.79, "learning_rate": 6.907620024104377e-06, "logits/chosen": -1.7470335960388184, "logits/rejected": -1.7184746265411377, "logps/chosen": -141.9943084716797, "logps/rejected": -115.97830200195312, "loss": 1.1579, "rewards/accuracies": 1.0, "rewards/chosen": 8.399803161621094, "rewards/margins": 3.112204074859619, "rewards/rejected": 5.287599086761475, "step": 3562 }, { "epoch": 0.79, "learning_rate": 6.905963147111776e-06, "logits/chosen": -1.2756931781768799, "logits/rejected": -1.241679310798645, "logps/chosen": -78.090087890625, "logps/rejected": -54.224449157714844, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": 5.876373291015625, "rewards/margins": 2.7350990772247314, "rewards/rejected": 3.1412742137908936, "step": 3563 }, { "epoch": 0.79, "learning_rate": 6.904306025207492e-06, "logits/chosen": -1.7025489807128906, "logits/rejected": -1.7342965602874756, "logps/chosen": -82.28022766113281, "logps/rejected": -71.1502456665039, "loss": 0.9249, "rewards/accuracies": 0.0, "rewards/chosen": 4.768861293792725, "rewards/margins": -0.7622013092041016, "rewards/rejected": 5.531062602996826, "step": 3564 }, { "epoch": 0.79, "learning_rate": 6.902648658604463e-06, "logits/chosen": -1.5360403060913086, "logits/rejected": -1.296722412109375, "logps/chosen": -46.81524658203125, "logps/rejected": -89.66593933105469, "loss": 2.0254, "rewards/accuracies": 0.0, "rewards/chosen": 2.719242811203003, "rewards/margins": -4.030176162719727, "rewards/rejected": 6.74941873550415, "step": 3565 }, { "epoch": 0.79, "learning_rate": 6.900991047515655e-06, "logits/chosen": -1.5690280199050903, "logits/rejected": -1.586296558380127, "logps/chosen": -54.605926513671875, "logps/rejected": -49.20749282836914, "loss": 1.6287, "rewards/accuracies": 0.0, "rewards/chosen": 3.0843796730041504, "rewards/margins": -0.06644940376281738, "rewards/rejected": 3.1508290767669678, "step": 3566 }, { "epoch": 0.79, "learning_rate": 6.899333192154067e-06, "logits/chosen": -1.7468069791793823, "logits/rejected": -1.715192198753357, "logps/chosen": -61.78605270385742, "logps/rejected": -57.17416763305664, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": 3.9720051288604736, "rewards/margins": 1.4971213340759277, "rewards/rejected": 2.474883794784546, "step": 3567 }, { "epoch": 0.79, "learning_rate": 6.89767509273273e-06, "logits/chosen": -1.8906779289245605, "logits/rejected": -1.859822154045105, "logps/chosen": -52.29932403564453, "logps/rejected": -49.67552185058594, "loss": 1.0838, "rewards/accuracies": 0.0, "rewards/chosen": 1.6791954040527344, "rewards/margins": -1.2146072387695312, "rewards/rejected": 2.8938026428222656, "step": 3568 }, { "epoch": 0.79, "learning_rate": 6.896016749464705e-06, "logits/chosen": -1.7390351295471191, "logits/rejected": -1.6756329536437988, "logps/chosen": -45.590335845947266, "logps/rejected": -49.96198272705078, "loss": 0.6883, "rewards/accuracies": 0.0, "rewards/chosen": 2.1476924419403076, "rewards/margins": -0.9829869270324707, "rewards/rejected": 3.1306793689727783, "step": 3569 }, { "epoch": 0.79, "learning_rate": 6.894358162563086e-06, "logits/chosen": -1.7270803451538086, "logits/rejected": -1.644561767578125, "logps/chosen": -101.94857788085938, "logps/rejected": -59.79972839355469, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 6.212683200836182, "rewards/margins": 3.5597596168518066, "rewards/rejected": 2.652923583984375, "step": 3570 }, { "epoch": 0.79, "learning_rate": 6.892699332240998e-06, "logits/chosen": -1.7827692031860352, "logits/rejected": -1.669095516204834, "logps/chosen": -60.90595245361328, "logps/rejected": -16.32880401611328, "loss": 0.2405, "rewards/accuracies": 1.0, "rewards/chosen": 1.3304672241210938, "rewards/margins": 0.5377336144447327, "rewards/rejected": 0.7927336096763611, "step": 3571 }, { "epoch": 0.79, "learning_rate": 6.891040258711593e-06, "logits/chosen": -1.9456607103347778, "logits/rejected": -1.894011378288269, "logps/chosen": -110.89033508300781, "logps/rejected": -59.34152603149414, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 8.192573547363281, "rewards/margins": 4.943885803222656, "rewards/rejected": 3.248687505722046, "step": 3572 }, { "epoch": 0.79, "learning_rate": 6.889380942188062e-06, "logits/chosen": -1.3994736671447754, "logits/rejected": -1.2929368019104004, "logps/chosen": -38.10240173339844, "logps/rejected": -29.97629165649414, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": 3.582714796066284, "rewards/margins": 1.7146499156951904, "rewards/rejected": 1.8680648803710938, "step": 3573 }, { "epoch": 0.79, "learning_rate": 6.887721382883623e-06, "logits/chosen": -1.591439962387085, "logits/rejected": -1.575658917427063, "logps/chosen": -56.729461669921875, "logps/rejected": -53.99092102050781, "loss": 1.7367, "rewards/accuracies": 0.0, "rewards/chosen": 2.6273324489593506, "rewards/margins": -0.48264217376708984, "rewards/rejected": 3.1099746227264404, "step": 3574 }, { "epoch": 0.79, "learning_rate": 6.8860615810115225e-06, "logits/chosen": -1.931922435760498, "logits/rejected": -1.8815044164657593, "logps/chosen": -112.47578430175781, "logps/rejected": -56.20330810546875, "loss": 0.2135, "rewards/accuracies": 1.0, "rewards/chosen": 9.41510009765625, "rewards/margins": 5.338059902191162, "rewards/rejected": 4.077040195465088, "step": 3575 }, { "epoch": 0.79, "learning_rate": 6.884401536785045e-06, "logits/chosen": -1.7612805366516113, "logits/rejected": -1.7676780223846436, "logps/chosen": -45.74094772338867, "logps/rejected": -63.783111572265625, "loss": 1.2077, "rewards/accuracies": 0.0, "rewards/chosen": 2.423344850540161, "rewards/margins": -0.45653486251831055, "rewards/rejected": 2.8798797130584717, "step": 3576 }, { "epoch": 0.79, "learning_rate": 6.882741250417498e-06, "logits/chosen": -1.5308113098144531, "logits/rejected": -1.5665688514709473, "logps/chosen": -98.86154174804688, "logps/rejected": -109.26873016357422, "loss": 1.9304, "rewards/accuracies": 0.0, "rewards/chosen": 2.3206353187561035, "rewards/margins": -3.8323044776916504, "rewards/rejected": 6.152939796447754, "step": 3577 }, { "epoch": 0.79, "learning_rate": 6.881080722122226e-06, "logits/chosen": -1.6810404062271118, "logits/rejected": -1.6862766742706299, "logps/chosen": -71.16381072998047, "logps/rejected": -89.13507080078125, "loss": 1.9871, "rewards/accuracies": 0.0, "rewards/chosen": 3.072216749191284, "rewards/margins": -3.3866851329803467, "rewards/rejected": 6.458901882171631, "step": 3578 }, { "epoch": 0.79, "learning_rate": 6.879419952112606e-06, "logits/chosen": -1.5631049871444702, "logits/rejected": -1.398957371711731, "logps/chosen": -157.5936279296875, "logps/rejected": -97.86259460449219, "loss": 0.6504, "rewards/accuracies": 1.0, "rewards/chosen": 7.475583076477051, "rewards/margins": 2.94622802734375, "rewards/rejected": 4.529355049133301, "step": 3579 }, { "epoch": 0.79, "learning_rate": 6.877758940602038e-06, "logits/chosen": -1.3163634538650513, "logits/rejected": -1.2071561813354492, "logps/chosen": -58.05021286010742, "logps/rejected": -81.88179016113281, "loss": 2.4393, "rewards/accuracies": 0.0, "rewards/chosen": 1.4444416761398315, "rewards/margins": -4.405270099639893, "rewards/rejected": 5.849711894989014, "step": 3580 }, { "epoch": 0.79, "learning_rate": 6.8760976878039595e-06, "logits/chosen": -1.8472874164581299, "logits/rejected": -1.9406794309616089, "logps/chosen": -127.8842544555664, "logps/rejected": -134.3983154296875, "loss": 0.8639, "rewards/accuracies": 0.0, "rewards/chosen": 7.91283655166626, "rewards/margins": -1.4644742012023926, "rewards/rejected": 9.377310752868652, "step": 3581 }, { "epoch": 0.79, "learning_rate": 6.87443619393184e-06, "logits/chosen": -1.591127872467041, "logits/rejected": -1.4585305452346802, "logps/chosen": -70.21318817138672, "logps/rejected": -14.863059043884277, "loss": 0.5155, "rewards/accuracies": 1.0, "rewards/chosen": 6.019130706787109, "rewards/margins": 4.309511184692383, "rewards/rejected": 1.7096195220947266, "step": 3582 }, { "epoch": 0.79, "learning_rate": 6.872774459199174e-06, "logits/chosen": -1.7772165536880493, "logits/rejected": -1.6971073150634766, "logps/chosen": -88.88819885253906, "logps/rejected": -55.149776458740234, "loss": 0.3409, "rewards/accuracies": 1.0, "rewards/chosen": 4.89373779296875, "rewards/margins": 1.785503625869751, "rewards/rejected": 3.108234167098999, "step": 3583 }, { "epoch": 0.79, "learning_rate": 6.871112483819493e-06, "logits/chosen": -1.461432933807373, "logits/rejected": -1.5132226943969727, "logps/chosen": -72.55276489257812, "logps/rejected": -141.48867797851562, "loss": 1.6659, "rewards/accuracies": 0.0, "rewards/chosen": 4.784019470214844, "rewards/margins": -3.2899770736694336, "rewards/rejected": 8.073996543884277, "step": 3584 }, { "epoch": 0.79, "learning_rate": 6.869450268006357e-06, "logits/chosen": -1.3256601095199585, "logits/rejected": -1.3256601095199585, "logps/chosen": -134.6917266845703, "logps/rejected": -134.6917266845703, "loss": 1.2011, "rewards/accuracies": 0.0, "rewards/chosen": 5.91024923324585, "rewards/margins": 0.0, "rewards/rejected": 5.91024923324585, "step": 3585 }, { "epoch": 0.79, "learning_rate": 6.867787811973353e-06, "logits/chosen": -1.7142398357391357, "logits/rejected": -1.7142398357391357, "logps/chosen": -79.2033920288086, "logps/rejected": -79.2033920288086, "loss": 0.3745, "rewards/accuracies": 0.0, "rewards/chosen": 3.175201416015625, "rewards/margins": 0.0, "rewards/rejected": 3.175201416015625, "step": 3586 }, { "epoch": 0.79, "learning_rate": 6.866125115934106e-06, "logits/chosen": -1.7004354000091553, "logits/rejected": -1.6428042650222778, "logps/chosen": -62.58704376220703, "logps/rejected": -48.64406967163086, "loss": 0.508, "rewards/accuracies": 1.0, "rewards/chosen": 3.2906854152679443, "rewards/margins": 0.446094274520874, "rewards/rejected": 2.8445911407470703, "step": 3587 }, { "epoch": 0.79, "learning_rate": 6.864462180102268e-06, "logits/chosen": -1.3461741209030151, "logits/rejected": -1.3461741209030151, "logps/chosen": -40.72772216796875, "logps/rejected": -40.72772216796875, "loss": 0.5239, "rewards/accuracies": 0.0, "rewards/chosen": 3.1735947132110596, "rewards/margins": 0.0, "rewards/rejected": 3.1735947132110596, "step": 3588 }, { "epoch": 0.79, "learning_rate": 6.862799004691522e-06, "logits/chosen": -1.8547468185424805, "logits/rejected": -1.8087445497512817, "logps/chosen": -78.22966003417969, "logps/rejected": -50.57154083251953, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 7.17715311050415, "rewards/margins": 3.1044836044311523, "rewards/rejected": 4.072669506072998, "step": 3589 }, { "epoch": 0.79, "learning_rate": 6.861135589915583e-06, "logits/chosen": -1.763211965560913, "logits/rejected": -1.7883563041687012, "logps/chosen": -92.86489868164062, "logps/rejected": -131.66209411621094, "loss": 0.2185, "rewards/accuracies": 1.0, "rewards/chosen": 8.781989097595215, "rewards/margins": 0.8752613067626953, "rewards/rejected": 7.9067277908325195, "step": 3590 }, { "epoch": 0.79, "learning_rate": 6.859471935988193e-06, "logits/chosen": -1.3767491579055786, "logits/rejected": -1.324366807937622, "logps/chosen": -104.53142547607422, "logps/rejected": -73.06109619140625, "loss": 1.5078, "rewards/accuracies": 0.0, "rewards/chosen": 4.49151086807251, "rewards/margins": -0.8903913497924805, "rewards/rejected": 5.38190221786499, "step": 3591 }, { "epoch": 0.8, "learning_rate": 6.85780804312313e-06, "logits/chosen": -1.814255714416504, "logits/rejected": -1.7208105325698853, "logps/chosen": -54.800071716308594, "logps/rejected": -27.329917907714844, "loss": 0.0898, "rewards/accuracies": 1.0, "rewards/chosen": 3.710409641265869, "rewards/margins": 2.915098190307617, "rewards/rejected": 0.7953113913536072, "step": 3592 }, { "epoch": 0.8, "learning_rate": 6.8561439115342e-06, "logits/chosen": -1.2854280471801758, "logits/rejected": -1.2458603382110596, "logps/chosen": -34.74812316894531, "logps/rejected": -47.72114181518555, "loss": 0.1744, "rewards/accuracies": 1.0, "rewards/chosen": 3.702613115310669, "rewards/margins": 0.8841671943664551, "rewards/rejected": 2.818445920944214, "step": 3593 }, { "epoch": 0.8, "learning_rate": 6.854479541435238e-06, "logits/chosen": -1.6576207876205444, "logits/rejected": -1.610505223274231, "logps/chosen": -43.71037292480469, "logps/rejected": -3.4710376262664795, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 4.848692417144775, "rewards/margins": 3.920839786529541, "rewards/rejected": 0.9278527498245239, "step": 3594 }, { "epoch": 0.8, "learning_rate": 6.852814933040114e-06, "logits/chosen": -1.7622512578964233, "logits/rejected": -1.7373775243759155, "logps/chosen": -83.83708190917969, "logps/rejected": -84.92184448242188, "loss": 2.5767, "rewards/accuracies": 0.0, "rewards/chosen": 6.1509599685668945, "rewards/margins": -1.040452480316162, "rewards/rejected": 7.191412448883057, "step": 3595 }, { "epoch": 0.8, "learning_rate": 6.851150086562725e-06, "logits/chosen": -1.5566664934158325, "logits/rejected": -1.5322141647338867, "logps/chosen": -57.17142868041992, "logps/rejected": -101.37197875976562, "loss": 0.1456, "rewards/accuracies": 1.0, "rewards/chosen": 3.0169620513916016, "rewards/margins": 1.0891101360321045, "rewards/rejected": 1.927851915359497, "step": 3596 }, { "epoch": 0.8, "learning_rate": 6.849485002216999e-06, "logits/chosen": -1.8214210271835327, "logits/rejected": -1.8118082284927368, "logps/chosen": -25.29958152770996, "logps/rejected": -33.490909576416016, "loss": 0.9773, "rewards/accuracies": 0.0, "rewards/chosen": 2.434603691101074, "rewards/margins": -0.4761362075805664, "rewards/rejected": 2.9107398986816406, "step": 3597 }, { "epoch": 0.8, "learning_rate": 6.847819680216897e-06, "logits/chosen": -1.4743858575820923, "logits/rejected": -1.4144022464752197, "logps/chosen": -110.22039031982422, "logps/rejected": -64.16230773925781, "loss": 0.4896, "rewards/accuracies": 1.0, "rewards/chosen": 3.7440803050994873, "rewards/margins": 0.5134572982788086, "rewards/rejected": 3.2306230068206787, "step": 3598 }, { "epoch": 0.8, "learning_rate": 6.846154120776408e-06, "logits/chosen": -1.7178294658660889, "logits/rejected": -1.4788126945495605, "logps/chosen": -50.29996871948242, "logps/rejected": -135.78436279296875, "loss": 3.4685, "rewards/accuracies": 0.0, "rewards/chosen": 3.3548038005828857, "rewards/margins": -3.193697690963745, "rewards/rejected": 6.548501491546631, "step": 3599 }, { "epoch": 0.8, "learning_rate": 6.844488324109554e-06, "logits/chosen": -1.8191556930541992, "logits/rejected": -1.7891007661819458, "logps/chosen": -103.5753173828125, "logps/rejected": -64.56654357910156, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 7.327301025390625, "rewards/margins": 4.508194923400879, "rewards/rejected": 2.819106340408325, "step": 3600 }, { "epoch": 0.8, "learning_rate": 6.842822290430382e-06, "logits/chosen": -1.9635792970657349, "logits/rejected": -1.964198350906372, "logps/chosen": -34.66995620727539, "logps/rejected": -44.07060241699219, "loss": 0.9201, "rewards/accuracies": 0.0, "rewards/chosen": 2.9803683757781982, "rewards/margins": -0.2849576473236084, "rewards/rejected": 3.2653260231018066, "step": 3601 }, { "epoch": 0.8, "learning_rate": 6.841156019952978e-06, "logits/chosen": -1.6950057744979858, "logits/rejected": -1.6982330083847046, "logps/chosen": -70.84480285644531, "logps/rejected": -68.23390197753906, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 6.169134616851807, "rewards/margins": 2.323754072189331, "rewards/rejected": 3.8453805446624756, "step": 3602 }, { "epoch": 0.8, "learning_rate": 6.839489512891451e-06, "logits/chosen": -1.3848752975463867, "logits/rejected": -1.3848752975463867, "logps/chosen": -34.01686477661133, "logps/rejected": -34.01686477661133, "loss": 0.3867, "rewards/accuracies": 0.0, "rewards/chosen": 2.1534860134124756, "rewards/margins": 0.0, "rewards/rejected": 2.1534860134124756, "step": 3603 }, { "epoch": 0.8, "learning_rate": 6.837822769459942e-06, "logits/chosen": -1.5212059020996094, "logits/rejected": -1.5040128231048584, "logps/chosen": -62.64330291748047, "logps/rejected": -41.13166427612305, "loss": 0.2922, "rewards/accuracies": 1.0, "rewards/chosen": 4.039501190185547, "rewards/margins": 0.27648496627807617, "rewards/rejected": 3.7630162239074707, "step": 3604 }, { "epoch": 0.8, "learning_rate": 6.836155789872626e-06, "logits/chosen": -1.4023758172988892, "logits/rejected": -1.4023758172988892, "logps/chosen": -61.72320556640625, "logps/rejected": -61.72320556640625, "loss": 0.5167, "rewards/accuracies": 0.0, "rewards/chosen": 3.6323013305664062, "rewards/margins": 0.0, "rewards/rejected": 3.6323013305664062, "step": 3605 }, { "epoch": 0.8, "learning_rate": 6.8344885743437054e-06, "logits/chosen": -1.4597262144088745, "logits/rejected": -1.3191739320755005, "logps/chosen": -116.32821655273438, "logps/rejected": -36.62910461425781, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 7.600293159484863, "rewards/margins": 5.223237991333008, "rewards/rejected": 2.3770554065704346, "step": 3606 }, { "epoch": 0.8, "learning_rate": 6.832821123087412e-06, "logits/chosen": -1.8862463235855103, "logits/rejected": -1.791168451309204, "logps/chosen": -88.65879821777344, "logps/rejected": -16.774728775024414, "loss": 1.0926, "rewards/accuracies": 1.0, "rewards/chosen": 2.2451820373535156, "rewards/margins": 1.508249044418335, "rewards/rejected": 0.7369329333305359, "step": 3607 }, { "epoch": 0.8, "learning_rate": 6.83115343631801e-06, "logits/chosen": -1.4262347221374512, "logits/rejected": -1.4262347221374512, "logps/chosen": -28.223899841308594, "logps/rejected": -28.223899841308594, "loss": 4.4786, "rewards/accuracies": 0.0, "rewards/chosen": 3.4996914863586426, "rewards/margins": 0.0, "rewards/rejected": 3.4996914863586426, "step": 3608 }, { "epoch": 0.8, "learning_rate": 6.829485514249795e-06, "logits/chosen": -1.4463911056518555, "logits/rejected": -1.2401065826416016, "logps/chosen": -88.17570495605469, "logps/rejected": -51.89649200439453, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 7.188106060028076, "rewards/margins": 4.823676586151123, "rewards/rejected": 2.364429473876953, "step": 3609 }, { "epoch": 0.8, "learning_rate": 6.82781735709709e-06, "logits/chosen": -1.788759708404541, "logits/rejected": -1.787831425666809, "logps/chosen": -99.01860046386719, "logps/rejected": -74.99073028564453, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 6.673068523406982, "rewards/margins": 3.1771326065063477, "rewards/rejected": 3.4959359169006348, "step": 3610 }, { "epoch": 0.8, "learning_rate": 6.826148965074246e-06, "logits/chosen": -1.3986152410507202, "logits/rejected": -1.3237762451171875, "logps/chosen": -46.80837631225586, "logps/rejected": -59.095176696777344, "loss": 1.1374, "rewards/accuracies": 1.0, "rewards/chosen": 2.9496219158172607, "rewards/margins": 0.6190309524536133, "rewards/rejected": 2.3305909633636475, "step": 3611 }, { "epoch": 0.8, "learning_rate": 6.824480338395652e-06, "logits/chosen": -1.7543950080871582, "logits/rejected": -1.7373130321502686, "logps/chosen": -95.53697204589844, "logps/rejected": -87.12565612792969, "loss": 2.4199, "rewards/accuracies": 0.0, "rewards/chosen": 4.4923996925354, "rewards/margins": -4.808285236358643, "rewards/rejected": 9.300684928894043, "step": 3612 }, { "epoch": 0.8, "learning_rate": 6.8228114772757195e-06, "logits/chosen": -1.5636745691299438, "logits/rejected": -1.5567680597305298, "logps/chosen": -70.7598876953125, "logps/rejected": -58.783592224121094, "loss": 0.5893, "rewards/accuracies": 0.0, "rewards/chosen": 3.5500221252441406, "rewards/margins": -0.6577315330505371, "rewards/rejected": 4.207753658294678, "step": 3613 }, { "epoch": 0.8, "learning_rate": 6.821142381928894e-06, "logits/chosen": -1.679640531539917, "logits/rejected": -1.7227146625518799, "logps/chosen": -121.56745910644531, "logps/rejected": -215.99795532226562, "loss": 1.039, "rewards/accuracies": 0.0, "rewards/chosen": 6.923530578613281, "rewards/margins": -1.5749130249023438, "rewards/rejected": 8.498443603515625, "step": 3614 }, { "epoch": 0.8, "learning_rate": 6.81947305256965e-06, "logits/chosen": -1.7368592023849487, "logits/rejected": -1.7277888059616089, "logps/chosen": -112.75365447998047, "logps/rejected": -50.05897903442383, "loss": 1.5277, "rewards/accuracies": 0.0, "rewards/chosen": 1.5368874073028564, "rewards/margins": -3.0061190128326416, "rewards/rejected": 4.543006420135498, "step": 3615 }, { "epoch": 0.8, "learning_rate": 6.817803489412492e-06, "logits/chosen": -1.5491275787353516, "logits/rejected": -1.5931084156036377, "logps/chosen": -93.96530151367188, "logps/rejected": -103.0841293334961, "loss": 1.2651, "rewards/accuracies": 0.0, "rewards/chosen": 5.56380033493042, "rewards/margins": -1.8536138534545898, "rewards/rejected": 7.41741418838501, "step": 3616 }, { "epoch": 0.8, "learning_rate": 6.816133692671958e-06, "logits/chosen": -1.4349918365478516, "logits/rejected": -1.2612968683242798, "logps/chosen": -175.1219482421875, "logps/rejected": -178.468994140625, "loss": 0.9367, "rewards/accuracies": 0.0, "rewards/chosen": 7.955389499664307, "rewards/margins": -1.6947875022888184, "rewards/rejected": 9.650177001953125, "step": 3617 }, { "epoch": 0.8, "learning_rate": 6.814463662562609e-06, "logits/chosen": -1.8272689580917358, "logits/rejected": -1.7463281154632568, "logps/chosen": -49.68756103515625, "logps/rejected": -6.892932415008545, "loss": 1.324, "rewards/accuracies": 1.0, "rewards/chosen": 1.7894219160079956, "rewards/margins": 0.8270083069801331, "rewards/rejected": 0.9624136090278625, "step": 3618 }, { "epoch": 0.8, "learning_rate": 6.8127933992990404e-06, "logits/chosen": -1.7168112993240356, "logits/rejected": -1.5105504989624023, "logps/chosen": -130.60821533203125, "logps/rejected": -34.430362701416016, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 5.4955291748046875, "rewards/margins": 4.310093402862549, "rewards/rejected": 1.1854358911514282, "step": 3619 }, { "epoch": 0.8, "learning_rate": 6.8111229030958795e-06, "logits/chosen": -1.7068052291870117, "logits/rejected": -1.6173163652420044, "logps/chosen": -89.01644897460938, "logps/rejected": -62.52750015258789, "loss": 0.3513, "rewards/accuracies": 1.0, "rewards/chosen": 7.5095672607421875, "rewards/margins": 3.39546537399292, "rewards/rejected": 4.114101886749268, "step": 3620 }, { "epoch": 0.8, "learning_rate": 6.809452174167779e-06, "logits/chosen": -1.596039891242981, "logits/rejected": -1.5119507312774658, "logps/chosen": -95.2867431640625, "logps/rejected": -71.33901977539062, "loss": 0.4454, "rewards/accuracies": 1.0, "rewards/chosen": 2.801892042160034, "rewards/margins": 0.7096145153045654, "rewards/rejected": 2.0922775268554688, "step": 3621 }, { "epoch": 0.8, "learning_rate": 6.807781212729423e-06, "logits/chosen": -1.3659014701843262, "logits/rejected": -1.3659014701843262, "logps/chosen": -44.65386962890625, "logps/rejected": -44.65386962890625, "loss": 0.3714, "rewards/accuracies": 0.0, "rewards/chosen": 2.7127532958984375, "rewards/margins": 0.0, "rewards/rejected": 2.7127532958984375, "step": 3622 }, { "epoch": 0.8, "learning_rate": 6.806110018995527e-06, "logits/chosen": -1.423190712928772, "logits/rejected": -1.3805043697357178, "logps/chosen": -38.277503967285156, "logps/rejected": -95.17288208007812, "loss": 1.5066, "rewards/accuracies": 1.0, "rewards/chosen": 3.4301629066467285, "rewards/margins": 2.4668633937835693, "rewards/rejected": 0.963299572467804, "step": 3623 }, { "epoch": 0.8, "learning_rate": 6.804438593180836e-06, "logits/chosen": -1.5838826894760132, "logits/rejected": -1.6223416328430176, "logps/chosen": -48.176612854003906, "logps/rejected": -70.14932250976562, "loss": 1.4758, "rewards/accuracies": 0.0, "rewards/chosen": 3.414966583251953, "rewards/margins": -0.09356999397277832, "rewards/rejected": 3.5085365772247314, "step": 3624 }, { "epoch": 0.8, "learning_rate": 6.8027669355001225e-06, "logits/chosen": -1.3523377180099487, "logits/rejected": -1.313519835472107, "logps/chosen": -54.8864631652832, "logps/rejected": -44.04491424560547, "loss": 0.8417, "rewards/accuracies": 0.0, "rewards/chosen": 1.7548855543136597, "rewards/margins": -0.525720477104187, "rewards/rejected": 2.2806060314178467, "step": 3625 }, { "epoch": 0.8, "learning_rate": 6.801095046168194e-06, "logits/chosen": -1.6055424213409424, "logits/rejected": -1.6599440574645996, "logps/chosen": -89.61712646484375, "logps/rejected": -57.5922966003418, "loss": 1.1583, "rewards/accuracies": 0.0, "rewards/chosen": 5.150184631347656, "rewards/margins": -1.8963027000427246, "rewards/rejected": 7.046487331390381, "step": 3626 }, { "epoch": 0.8, "learning_rate": 6.799422925399879e-06, "logits/chosen": -1.65315842628479, "logits/rejected": -1.4629300832748413, "logps/chosen": -109.65338134765625, "logps/rejected": -53.466064453125, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 5.615933418273926, "rewards/margins": 5.883759498596191, "rewards/rejected": -0.2678260803222656, "step": 3627 }, { "epoch": 0.8, "learning_rate": 6.797750573410045e-06, "logits/chosen": -1.3248800039291382, "logits/rejected": -1.1801629066467285, "logps/chosen": -21.11162757873535, "logps/rejected": -79.97531127929688, "loss": 2.9002, "rewards/accuracies": 0.0, "rewards/chosen": 1.3292878866195679, "rewards/margins": -5.089344501495361, "rewards/rejected": 6.418632507324219, "step": 3628 }, { "epoch": 0.8, "learning_rate": 6.796077990413583e-06, "logits/chosen": -1.9783549308776855, "logits/rejected": -1.913543701171875, "logps/chosen": -94.38417053222656, "logps/rejected": -82.21600341796875, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 7.97340726852417, "rewards/margins": 2.689929485321045, "rewards/rejected": 5.283477783203125, "step": 3629 }, { "epoch": 0.8, "learning_rate": 6.7944051766254185e-06, "logits/chosen": -1.552193284034729, "logits/rejected": -1.552193284034729, "logps/chosen": -66.55009460449219, "logps/rejected": -66.55009460449219, "loss": 0.4329, "rewards/accuracies": 0.0, "rewards/chosen": 5.966174602508545, "rewards/margins": 0.0, "rewards/rejected": 5.966174602508545, "step": 3630 }, { "epoch": 0.8, "learning_rate": 6.792732132260501e-06, "logits/chosen": -1.6145814657211304, "logits/rejected": -1.534644365310669, "logps/chosen": -159.94139099121094, "logps/rejected": -88.30194854736328, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": 7.952317714691162, "rewards/margins": 4.936009883880615, "rewards/rejected": 3.016307830810547, "step": 3631 }, { "epoch": 0.8, "learning_rate": 6.791058857533814e-06, "logits/chosen": -1.5997891426086426, "logits/rejected": -1.6982417106628418, "logps/chosen": -41.692626953125, "logps/rejected": -154.1223907470703, "loss": 2.0701, "rewards/accuracies": 0.0, "rewards/chosen": 3.330792188644409, "rewards/margins": -3.9651687145233154, "rewards/rejected": 7.295960903167725, "step": 3632 }, { "epoch": 0.8, "learning_rate": 6.7893853526603706e-06, "logits/chosen": -1.6318204402923584, "logits/rejected": -1.61062753200531, "logps/chosen": -145.1237030029297, "logps/rejected": -120.99211120605469, "loss": 0.7696, "rewards/accuracies": 0.0, "rewards/chosen": 5.642338752746582, "rewards/margins": -1.205122470855713, "rewards/rejected": 6.847461223602295, "step": 3633 }, { "epoch": 0.8, "learning_rate": 6.7877116178552106e-06, "logits/chosen": -1.6043235063552856, "logits/rejected": -1.5999000072479248, "logps/chosen": -68.96151733398438, "logps/rejected": -70.66485595703125, "loss": 0.4079, "rewards/accuracies": 0.0, "rewards/chosen": 4.6080322265625, "rewards/margins": -0.23121261596679688, "rewards/rejected": 4.839244842529297, "step": 3634 }, { "epoch": 0.8, "learning_rate": 6.786037653333406e-06, "logits/chosen": -1.4540936946868896, "logits/rejected": -1.4623576402664185, "logps/chosen": -65.58755493164062, "logps/rejected": -78.75590515136719, "loss": 0.202, "rewards/accuracies": 1.0, "rewards/chosen": 3.579357862472534, "rewards/margins": 1.074697732925415, "rewards/rejected": 2.504660129547119, "step": 3635 }, { "epoch": 0.8, "learning_rate": 6.784363459310055e-06, "logits/chosen": -1.672121286392212, "logits/rejected": -1.6173845529556274, "logps/chosen": -47.11090087890625, "logps/rejected": -82.29129028320312, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 4.746728420257568, "rewards/margins": 0.6892094612121582, "rewards/rejected": 4.05751895904541, "step": 3636 }, { "epoch": 0.81, "learning_rate": 6.78268903600029e-06, "logits/chosen": -2.0302882194519043, "logits/rejected": -1.8958953619003296, "logps/chosen": -30.491241455078125, "logps/rejected": -142.91403198242188, "loss": 5.2758, "rewards/accuracies": 0.0, "rewards/chosen": 2.6372170448303223, "rewards/margins": -7.995914936065674, "rewards/rejected": 10.633131980895996, "step": 3637 }, { "epoch": 0.81, "learning_rate": 6.781014383619268e-06, "logits/chosen": -1.1499097347259521, "logits/rejected": -1.1499097347259521, "logps/chosen": -20.849882125854492, "logps/rejected": -20.849882125854492, "loss": 0.5076, "rewards/accuracies": 0.0, "rewards/chosen": 1.5327612161636353, "rewards/margins": 0.0, "rewards/rejected": 1.5327612161636353, "step": 3638 }, { "epoch": 0.81, "learning_rate": 6.7793395023821795e-06, "logits/chosen": -1.8100191354751587, "logits/rejected": -1.7644370794296265, "logps/chosen": -82.34161376953125, "logps/rejected": -57.04373550415039, "loss": 0.195, "rewards/accuracies": 1.0, "rewards/chosen": 3.376922607421875, "rewards/margins": 0.7443621158599854, "rewards/rejected": 2.6325604915618896, "step": 3639 }, { "epoch": 0.81, "learning_rate": 6.777664392504243e-06, "logits/chosen": -1.6389291286468506, "logits/rejected": -1.6382321119308472, "logps/chosen": -100.37200927734375, "logps/rejected": -89.39883422851562, "loss": 0.489, "rewards/accuracies": 0.0, "rewards/chosen": 5.416914463043213, "rewards/margins": -0.20168781280517578, "rewards/rejected": 5.618602275848389, "step": 3640 }, { "epoch": 0.81, "learning_rate": 6.775989054200706e-06, "logits/chosen": -1.8374223709106445, "logits/rejected": -1.80440354347229, "logps/chosen": -81.98502349853516, "logps/rejected": -103.64169311523438, "loss": 0.6553, "rewards/accuracies": 0.0, "rewards/chosen": 8.002127647399902, "rewards/margins": -0.31057262420654297, "rewards/rejected": 8.312700271606445, "step": 3641 }, { "epoch": 0.81, "learning_rate": 6.774313487686843e-06, "logits/chosen": -1.6877367496490479, "logits/rejected": -1.513817310333252, "logps/chosen": -98.86715698242188, "logps/rejected": -64.2398910522461, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 4.724377632141113, "rewards/margins": 1.6411752700805664, "rewards/rejected": 3.083202362060547, "step": 3642 }, { "epoch": 0.81, "learning_rate": 6.772637693177965e-06, "logits/chosen": -1.7897825241088867, "logits/rejected": -1.7897825241088867, "logps/chosen": -74.86564636230469, "logps/rejected": -74.86564636230469, "loss": 0.4147, "rewards/accuracies": 0.0, "rewards/chosen": 4.790814399719238, "rewards/margins": 0.0, "rewards/rejected": 4.790814399719238, "step": 3643 }, { "epoch": 0.81, "learning_rate": 6.770961670889403e-06, "logits/chosen": -1.569451928138733, "logits/rejected": -1.5026509761810303, "logps/chosen": -149.68231201171875, "logps/rejected": -38.95472717285156, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 8.702095031738281, "rewards/margins": 5.758687496185303, "rewards/rejected": 2.9434075355529785, "step": 3644 }, { "epoch": 0.81, "learning_rate": 6.769285421036523e-06, "logits/chosen": -1.6577078104019165, "logits/rejected": -1.6299422979354858, "logps/chosen": -51.469940185546875, "logps/rejected": -24.143875122070312, "loss": 0.5037, "rewards/accuracies": 0.0, "rewards/chosen": 3.0745294094085693, "rewards/margins": -0.5373637676239014, "rewards/rejected": 3.6118931770324707, "step": 3645 }, { "epoch": 0.81, "learning_rate": 6.767608943834721e-06, "logits/chosen": -1.5019316673278809, "logits/rejected": -1.4436593055725098, "logps/chosen": -74.1430435180664, "logps/rejected": -49.86473083496094, "loss": 0.1836, "rewards/accuracies": 1.0, "rewards/chosen": 3.4014413356781006, "rewards/margins": 0.8508317470550537, "rewards/rejected": 2.550609588623047, "step": 3646 }, { "epoch": 0.81, "learning_rate": 6.765932239499418e-06, "logits/chosen": -1.472438097000122, "logits/rejected": -1.5139241218566895, "logps/chosen": -61.73189163208008, "logps/rejected": -72.3673095703125, "loss": 0.4842, "rewards/accuracies": 0.0, "rewards/chosen": 3.1568875312805176, "rewards/margins": -0.4082064628601074, "rewards/rejected": 3.565093994140625, "step": 3647 }, { "epoch": 0.81, "learning_rate": 6.764255308246067e-06, "logits/chosen": -1.6093865633010864, "logits/rejected": -1.5297945737838745, "logps/chosen": -42.274024963378906, "logps/rejected": -10.403094291687012, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 2.906932830810547, "rewards/margins": 1.7705280780792236, "rewards/rejected": 1.1364047527313232, "step": 3648 }, { "epoch": 0.81, "learning_rate": 6.762578150290151e-06, "logits/chosen": -1.327056646347046, "logits/rejected": -1.2449226379394531, "logps/chosen": -31.58491325378418, "logps/rejected": -5.770384788513184, "loss": 0.3683, "rewards/accuracies": 1.0, "rewards/chosen": 2.871046781539917, "rewards/margins": 1.9717676639556885, "rewards/rejected": 0.8992791175842285, "step": 3649 }, { "epoch": 0.81, "learning_rate": 6.76090076584718e-06, "logits/chosen": -1.6482195854187012, "logits/rejected": -1.6437582969665527, "logps/chosen": -55.02143859863281, "logps/rejected": -66.1231689453125, "loss": 2.5432, "rewards/accuracies": 0.0, "rewards/chosen": 3.380725145339966, "rewards/margins": -3.607464551925659, "rewards/rejected": 6.988189697265625, "step": 3650 }, { "epoch": 0.81, "learning_rate": 6.759223155132693e-06, "logits/chosen": -1.5435774326324463, "logits/rejected": -1.4692654609680176, "logps/chosen": -126.53279113769531, "logps/rejected": -81.82601928710938, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 5.908998012542725, "rewards/margins": 3.287950038909912, "rewards/rejected": 2.6210479736328125, "step": 3651 }, { "epoch": 0.81, "learning_rate": 6.7575453183622594e-06, "logits/chosen": -1.656610369682312, "logits/rejected": -1.652755856513977, "logps/chosen": -33.41880416870117, "logps/rejected": -28.431922912597656, "loss": 0.7118, "rewards/accuracies": 0.0, "rewards/chosen": 1.4046592712402344, "rewards/margins": -0.31332898139953613, "rewards/rejected": 1.7179882526397705, "step": 3652 }, { "epoch": 0.81, "learning_rate": 6.755867255751478e-06, "logits/chosen": -1.4808838367462158, "logits/rejected": -1.4144705533981323, "logps/chosen": -142.8744354248047, "logps/rejected": -161.89337158203125, "loss": 1.8589, "rewards/accuracies": 1.0, "rewards/chosen": 5.431005954742432, "rewards/margins": 1.2689409255981445, "rewards/rejected": 4.162065029144287, "step": 3653 }, { "epoch": 0.81, "learning_rate": 6.754188967515975e-06, "logits/chosen": -1.4683502912521362, "logits/rejected": -1.454548954963684, "logps/chosen": -26.836225509643555, "logps/rejected": -49.19480514526367, "loss": 0.3178, "rewards/accuracies": 1.0, "rewards/chosen": 2.291409492492676, "rewards/margins": 0.15491318702697754, "rewards/rejected": 2.1364963054656982, "step": 3654 }, { "epoch": 0.81, "learning_rate": 6.752510453871407e-06, "logits/chosen": -1.401833415031433, "logits/rejected": -1.3876440525054932, "logps/chosen": -24.73334503173828, "logps/rejected": -52.626869201660156, "loss": 1.2946, "rewards/accuracies": 0.0, "rewards/chosen": 1.503326416015625, "rewards/margins": -2.5035014152526855, "rewards/rejected": 4.0068278312683105, "step": 3655 }, { "epoch": 0.81, "learning_rate": 6.7508317150334576e-06, "logits/chosen": -1.7102104425430298, "logits/rejected": -1.6355438232421875, "logps/chosen": -132.46603393554688, "logps/rejected": -40.141502380371094, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": 6.021200656890869, "rewards/margins": 3.2491798400878906, "rewards/rejected": 2.7720208168029785, "step": 3656 }, { "epoch": 0.81, "learning_rate": 6.749152751217842e-06, "logits/chosen": -1.635741949081421, "logits/rejected": -1.6425087451934814, "logps/chosen": -95.39604187011719, "logps/rejected": -67.5013198852539, "loss": 0.6453, "rewards/accuracies": 0.0, "rewards/chosen": 2.94724440574646, "rewards/margins": -0.8358802795410156, "rewards/rejected": 3.7831246852874756, "step": 3657 }, { "epoch": 0.81, "learning_rate": 6.747473562640303e-06, "logits/chosen": -1.623295545578003, "logits/rejected": -1.283516526222229, "logps/chosen": -161.77012634277344, "logps/rejected": -73.36489868164062, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": 6.706063747406006, "rewards/margins": 1.4918813705444336, "rewards/rejected": 5.214182376861572, "step": 3658 }, { "epoch": 0.81, "learning_rate": 6.7457941495166136e-06, "logits/chosen": -1.71538507938385, "logits/rejected": -1.3941479921340942, "logps/chosen": -60.08903503417969, "logps/rejected": -207.8262939453125, "loss": 3.0876, "rewards/accuracies": 0.0, "rewards/chosen": 4.324989318847656, "rewards/margins": -6.171687126159668, "rewards/rejected": 10.496676445007324, "step": 3659 }, { "epoch": 0.81, "learning_rate": 6.744114512062571e-06, "logits/chosen": -1.4816902875900269, "logits/rejected": -1.3530341386795044, "logps/chosen": -78.71247100830078, "logps/rejected": -27.15721893310547, "loss": 1.5309, "rewards/accuracies": 1.0, "rewards/chosen": 4.629709720611572, "rewards/margins": 2.4659500122070312, "rewards/rejected": 2.163759708404541, "step": 3660 }, { "epoch": 0.81, "learning_rate": 6.7424346504940075e-06, "logits/chosen": -1.8459769487380981, "logits/rejected": -1.880115032196045, "logps/chosen": -77.42805480957031, "logps/rejected": -78.34529113769531, "loss": 1.8577, "rewards/accuracies": 0.0, "rewards/chosen": 1.6264899969100952, "rewards/margins": -3.6569833755493164, "rewards/rejected": 5.283473491668701, "step": 3661 }, { "epoch": 0.81, "learning_rate": 6.740754565026778e-06, "logits/chosen": -1.7251334190368652, "logits/rejected": -1.7140804529190063, "logps/chosen": -93.56211853027344, "logps/rejected": -50.22850799560547, "loss": 0.4635, "rewards/accuracies": 0.0, "rewards/chosen": 3.730799913406372, "rewards/margins": -0.3017570972442627, "rewards/rejected": 4.032557010650635, "step": 3662 }, { "epoch": 0.81, "learning_rate": 6.739074255876773e-06, "logits/chosen": -1.6121703386306763, "logits/rejected": -1.7098854780197144, "logps/chosen": -56.58237838745117, "logps/rejected": -78.54155731201172, "loss": 1.0878, "rewards/accuracies": 0.0, "rewards/chosen": 4.4882636070251465, "rewards/margins": -1.3809986114501953, "rewards/rejected": 5.869262218475342, "step": 3663 }, { "epoch": 0.81, "learning_rate": 6.737393723259906e-06, "logits/chosen": -1.6646912097930908, "logits/rejected": -1.6858880519866943, "logps/chosen": -111.98056030273438, "logps/rejected": -145.6120147705078, "loss": 4.2796, "rewards/accuracies": 0.0, "rewards/chosen": 6.175274848937988, "rewards/margins": -7.8655805587768555, "rewards/rejected": 14.040855407714844, "step": 3664 }, { "epoch": 0.81, "learning_rate": 6.735712967392123e-06, "logits/chosen": -1.6575372219085693, "logits/rejected": -1.4511703252792358, "logps/chosen": -122.53720092773438, "logps/rejected": -21.669147491455078, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": 6.589001655578613, "rewards/margins": 5.051094055175781, "rewards/rejected": 1.5379078388214111, "step": 3665 }, { "epoch": 0.81, "learning_rate": 6.734031988489396e-06, "logits/chosen": -1.5081284046173096, "logits/rejected": -1.294382095336914, "logps/chosen": -58.157745361328125, "logps/rejected": -15.299676895141602, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 5.776041507720947, "rewards/margins": 4.024534225463867, "rewards/rejected": 1.7515074014663696, "step": 3666 }, { "epoch": 0.81, "learning_rate": 6.732350786767726e-06, "logits/chosen": -1.6150367259979248, "logits/rejected": -1.5221832990646362, "logps/chosen": -42.55317687988281, "logps/rejected": -29.6353702545166, "loss": 2.0252, "rewards/accuracies": 1.0, "rewards/chosen": 4.411257266998291, "rewards/margins": 3.154496669769287, "rewards/rejected": 1.256760597229004, "step": 3667 }, { "epoch": 0.81, "learning_rate": 6.730669362443147e-06, "logits/chosen": -1.3482431173324585, "logits/rejected": -1.3482431173324585, "logps/chosen": -61.132686614990234, "logps/rejected": -61.132686614990234, "loss": 0.3703, "rewards/accuracies": 0.0, "rewards/chosen": 4.061014175415039, "rewards/margins": 0.0, "rewards/rejected": 4.061014175415039, "step": 3668 }, { "epoch": 0.81, "learning_rate": 6.728987715731712e-06, "logits/chosen": -1.6753225326538086, "logits/rejected": -1.6515675783157349, "logps/chosen": -71.60706329345703, "logps/rejected": -67.52708435058594, "loss": 0.2181, "rewards/accuracies": 1.0, "rewards/chosen": 5.309754848480225, "rewards/margins": 1.777958631515503, "rewards/rejected": 3.5317962169647217, "step": 3669 }, { "epoch": 0.81, "learning_rate": 6.727305846849513e-06, "logits/chosen": -1.4532556533813477, "logits/rejected": -1.4200164079666138, "logps/chosen": -71.65863037109375, "logps/rejected": -44.32451629638672, "loss": 0.7794, "rewards/accuracies": 0.0, "rewards/chosen": 2.2371277809143066, "rewards/margins": -1.2575247287750244, "rewards/rejected": 3.494652509689331, "step": 3670 }, { "epoch": 0.81, "learning_rate": 6.725623756012667e-06, "logits/chosen": -1.318418264389038, "logits/rejected": -1.3130378723144531, "logps/chosen": -34.551883697509766, "logps/rejected": -38.468353271484375, "loss": 0.5446, "rewards/accuracies": 0.0, "rewards/chosen": 1.1878544092178345, "rewards/margins": -0.5776653289794922, "rewards/rejected": 1.7655197381973267, "step": 3671 }, { "epoch": 0.81, "learning_rate": 6.7239414434373144e-06, "logits/chosen": -1.6253148317337036, "logits/rejected": -1.6095857620239258, "logps/chosen": -64.76895141601562, "logps/rejected": -56.63463592529297, "loss": 0.4491, "rewards/accuracies": 0.0, "rewards/chosen": 2.5291459560394287, "rewards/margins": -0.329038143157959, "rewards/rejected": 2.8581840991973877, "step": 3672 }, { "epoch": 0.81, "learning_rate": 6.722258909339632e-06, "logits/chosen": -1.7312560081481934, "logits/rejected": -1.6619127988815308, "logps/chosen": -115.65850830078125, "logps/rejected": -141.32650756835938, "loss": 0.2023, "rewards/accuracies": 1.0, "rewards/chosen": 8.009053230285645, "rewards/margins": 0.7271804809570312, "rewards/rejected": 7.281872749328613, "step": 3673 }, { "epoch": 0.81, "learning_rate": 6.720576153935818e-06, "logits/chosen": -1.7594233751296997, "logits/rejected": -1.778562068939209, "logps/chosen": -38.363250732421875, "logps/rejected": -52.879913330078125, "loss": 0.4876, "rewards/accuracies": 0.0, "rewards/chosen": 2.4149487018585205, "rewards/margins": -0.49277305603027344, "rewards/rejected": 2.907721757888794, "step": 3674 }, { "epoch": 0.81, "learning_rate": 6.718893177442105e-06, "logits/chosen": -1.721476435661316, "logits/rejected": -1.6939131021499634, "logps/chosen": -38.456661224365234, "logps/rejected": -71.73086547851562, "loss": 0.4303, "rewards/accuracies": 0.0, "rewards/chosen": 4.134634017944336, "rewards/margins": -0.20158910751342773, "rewards/rejected": 4.336223125457764, "step": 3675 }, { "epoch": 0.81, "learning_rate": 6.717209980074752e-06, "logits/chosen": -1.3808155059814453, "logits/rejected": -1.3704428672790527, "logps/chosen": -45.194679260253906, "logps/rejected": -37.24723815917969, "loss": 0.6491, "rewards/accuracies": 0.0, "rewards/chosen": 3.3375351428985596, "rewards/margins": -0.18819117546081543, "rewards/rejected": 3.525726318359375, "step": 3676 }, { "epoch": 0.81, "learning_rate": 6.715526562050044e-06, "logits/chosen": -1.3465688228607178, "logits/rejected": -1.2259173393249512, "logps/chosen": -69.39741516113281, "logps/rejected": -100.85236358642578, "loss": 0.4491, "rewards/accuracies": 1.0, "rewards/chosen": 4.180174350738525, "rewards/margins": 1.213416337966919, "rewards/rejected": 2.9667580127716064, "step": 3677 }, { "epoch": 0.81, "learning_rate": 6.713842923584297e-06, "logits/chosen": -1.5456238985061646, "logits/rejected": -1.5345686674118042, "logps/chosen": -54.22459411621094, "logps/rejected": -64.81488037109375, "loss": 1.1573, "rewards/accuracies": 0.0, "rewards/chosen": 2.6096389293670654, "rewards/margins": -2.207423448562622, "rewards/rejected": 4.8170623779296875, "step": 3678 }, { "epoch": 0.81, "learning_rate": 6.712159064893854e-06, "logits/chosen": -1.2924550771713257, "logits/rejected": -1.430953860282898, "logps/chosen": -38.875057220458984, "logps/rejected": -117.40275573730469, "loss": 2.3915, "rewards/accuracies": 0.0, "rewards/chosen": 2.147048234939575, "rewards/margins": -4.772795677185059, "rewards/rejected": 6.919844150543213, "step": 3679 }, { "epoch": 0.81, "learning_rate": 6.710474986195087e-06, "logits/chosen": -1.5417505502700806, "logits/rejected": -1.5737497806549072, "logps/chosen": -33.13710021972656, "logps/rejected": -48.77360534667969, "loss": 2.0095, "rewards/accuracies": 0.0, "rewards/chosen": 1.2338173389434814, "rewards/margins": -2.524350643157959, "rewards/rejected": 3.7581679821014404, "step": 3680 }, { "epoch": 0.81, "learning_rate": 6.708790687704397e-06, "logits/chosen": -1.5781422853469849, "logits/rejected": -1.4697422981262207, "logps/chosen": -149.4400177001953, "logps/rejected": -62.80524444580078, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 7.171908855438232, "rewards/margins": 3.3182153701782227, "rewards/rejected": 3.8536934852600098, "step": 3681 }, { "epoch": 0.81, "learning_rate": 6.70710616963821e-06, "logits/chosen": -1.5149810314178467, "logits/rejected": -1.3955802917480469, "logps/chosen": -107.57190704345703, "logps/rejected": -55.5540885925293, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 5.55397891998291, "rewards/margins": 2.441871404647827, "rewards/rejected": 3.112107515335083, "step": 3682 }, { "epoch": 0.82, "learning_rate": 6.705421432212984e-06, "logits/chosen": -1.6309590339660645, "logits/rejected": -1.632978916168213, "logps/chosen": -41.23777389526367, "logps/rejected": -34.88527297973633, "loss": 0.9425, "rewards/accuracies": 0.0, "rewards/chosen": 2.2249934673309326, "rewards/margins": -1.1314752101898193, "rewards/rejected": 3.356468677520752, "step": 3683 }, { "epoch": 0.82, "learning_rate": 6.703736475645205e-06, "logits/chosen": -1.7704023122787476, "logits/rejected": -1.7623565196990967, "logps/chosen": -50.341827392578125, "logps/rejected": -138.16485595703125, "loss": 1.4265, "rewards/accuracies": 0.0, "rewards/chosen": 3.3111572265625, "rewards/margins": -2.7407565116882324, "rewards/rejected": 6.051913738250732, "step": 3684 }, { "epoch": 0.82, "learning_rate": 6.702051300151384e-06, "logits/chosen": -1.4614710807800293, "logits/rejected": -1.5656592845916748, "logps/chosen": -79.99293518066406, "logps/rejected": -106.50047302246094, "loss": 2.8385, "rewards/accuracies": 0.0, "rewards/chosen": 3.107506513595581, "rewards/margins": -5.672919273376465, "rewards/rejected": 8.780426025390625, "step": 3685 }, { "epoch": 0.82, "learning_rate": 6.700365905948063e-06, "logits/chosen": -1.6126278638839722, "logits/rejected": -1.5533807277679443, "logps/chosen": -114.543212890625, "logps/rejected": -140.93295288085938, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 5.606337070465088, "rewards/margins": 0.7253952026367188, "rewards/rejected": 4.880941867828369, "step": 3686 }, { "epoch": 0.82, "learning_rate": 6.698680293251809e-06, "logits/chosen": -1.8114681243896484, "logits/rejected": -1.7982347011566162, "logps/chosen": -36.83550262451172, "logps/rejected": -50.70556640625, "loss": 0.9901, "rewards/accuracies": 0.0, "rewards/chosen": 3.2006492614746094, "rewards/margins": -0.4783484935760498, "rewards/rejected": 3.678997755050659, "step": 3687 }, { "epoch": 0.82, "learning_rate": 6.696994462279223e-06, "logits/chosen": -1.3522582054138184, "logits/rejected": -1.3571034669876099, "logps/chosen": -69.4445571899414, "logps/rejected": -39.59064483642578, "loss": 0.6161, "rewards/accuracies": 0.0, "rewards/chosen": 3.6715660095214844, "rewards/margins": -0.11208033561706543, "rewards/rejected": 3.78364634513855, "step": 3688 }, { "epoch": 0.82, "learning_rate": 6.695308413246926e-06, "logits/chosen": -1.3632460832595825, "logits/rejected": -1.301261067390442, "logps/chosen": -78.0068359375, "logps/rejected": -83.00350189208984, "loss": 0.4287, "rewards/accuracies": 0.0, "rewards/chosen": 4.233521938323975, "rewards/margins": -0.13546466827392578, "rewards/rejected": 4.3689866065979, "step": 3689 }, { "epoch": 0.82, "learning_rate": 6.693622146371574e-06, "logits/chosen": -1.1546084880828857, "logits/rejected": -1.5008655786514282, "logps/chosen": -12.889904975891113, "logps/rejected": -61.25729751586914, "loss": 1.4507, "rewards/accuracies": 0.0, "rewards/chosen": 0.9300698637962341, "rewards/margins": -1.9220750331878662, "rewards/rejected": 2.852144956588745, "step": 3690 }, { "epoch": 0.82, "learning_rate": 6.6919356618698485e-06, "logits/chosen": -1.6024084091186523, "logits/rejected": -1.6304781436920166, "logps/chosen": -119.79559326171875, "logps/rejected": -109.16399383544922, "loss": 0.9838, "rewards/accuracies": 0.0, "rewards/chosen": 5.96101713180542, "rewards/margins": -1.747598648071289, "rewards/rejected": 7.708615779876709, "step": 3691 }, { "epoch": 0.82, "learning_rate": 6.690248959958458e-06, "logits/chosen": -1.3135383129119873, "logits/rejected": -1.2188127040863037, "logps/chosen": -33.93080139160156, "logps/rejected": -11.768856048583984, "loss": 0.489, "rewards/accuracies": 1.0, "rewards/chosen": 1.9247184991836548, "rewards/margins": 1.1099166870117188, "rewards/rejected": 0.814801812171936, "step": 3692 }, { "epoch": 0.82, "learning_rate": 6.688562040854137e-06, "logits/chosen": -1.5741093158721924, "logits/rejected": -1.6884413957595825, "logps/chosen": -63.385982513427734, "logps/rejected": -37.46989059448242, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 3.0567739009857178, "rewards/margins": 1.4356175661087036, "rewards/rejected": 1.6211563348770142, "step": 3693 }, { "epoch": 0.82, "learning_rate": 6.686874904773656e-06, "logits/chosen": -1.3276245594024658, "logits/rejected": -1.3224788904190063, "logps/chosen": -36.327762603759766, "logps/rejected": -37.8876838684082, "loss": 1.6437, "rewards/accuracies": 0.0, "rewards/chosen": 2.506776809692383, "rewards/margins": -0.610443115234375, "rewards/rejected": 3.117219924926758, "step": 3694 }, { "epoch": 0.82, "learning_rate": 6.685187551933802e-06, "logits/chosen": -1.4545743465423584, "logits/rejected": -1.373967170715332, "logps/chosen": -53.88520050048828, "logps/rejected": -51.571807861328125, "loss": 0.7132, "rewards/accuracies": 0.0, "rewards/chosen": 2.896782636642456, "rewards/margins": -1.1111199855804443, "rewards/rejected": 4.0079026222229, "step": 3695 }, { "epoch": 0.82, "learning_rate": 6.6834999825514004e-06, "logits/chosen": -1.342415452003479, "logits/rejected": -1.3073208332061768, "logps/chosen": -40.594970703125, "logps/rejected": -103.33901977539062, "loss": 1.0255, "rewards/accuracies": 0.0, "rewards/chosen": 2.4128806591033936, "rewards/margins": -0.9351174831390381, "rewards/rejected": 3.3479981422424316, "step": 3696 }, { "epoch": 0.82, "learning_rate": 6.681812196843298e-06, "logits/chosen": -1.7822586297988892, "logits/rejected": -1.7388293743133545, "logps/chosen": -169.52676391601562, "logps/rejected": -186.42160034179688, "loss": 0.2557, "rewards/accuracies": 1.0, "rewards/chosen": 8.597304344177246, "rewards/margins": 0.4957447052001953, "rewards/rejected": 8.10155963897705, "step": 3697 }, { "epoch": 0.82, "learning_rate": 6.68012419502637e-06, "logits/chosen": -1.4667476415634155, "logits/rejected": -1.368913173675537, "logps/chosen": -51.40321350097656, "logps/rejected": -68.64652252197266, "loss": 0.3066, "rewards/accuracies": 1.0, "rewards/chosen": 3.147818088531494, "rewards/margins": 0.29797816276550293, "rewards/rejected": 2.849839925765991, "step": 3698 }, { "epoch": 0.82, "learning_rate": 6.678435977317524e-06, "logits/chosen": -1.5675588846206665, "logits/rejected": -1.6637994050979614, "logps/chosen": -33.85390090942383, "logps/rejected": -141.90814208984375, "loss": 4.4688, "rewards/accuracies": 0.0, "rewards/chosen": 2.734419584274292, "rewards/margins": -6.750489234924316, "rewards/rejected": 9.484909057617188, "step": 3699 }, { "epoch": 0.82, "learning_rate": 6.676747543933687e-06, "logits/chosen": -1.3841062784194946, "logits/rejected": -1.36373770236969, "logps/chosen": -140.133544921875, "logps/rejected": -145.02621459960938, "loss": 2.2144, "rewards/accuracies": 1.0, "rewards/chosen": 6.480133056640625, "rewards/margins": 0.4489898681640625, "rewards/rejected": 6.0311431884765625, "step": 3700 }, { "epoch": 0.82, "learning_rate": 6.675058895091824e-06, "logits/chosen": -1.5264856815338135, "logits/rejected": -1.4719372987747192, "logps/chosen": -85.85176849365234, "logps/rejected": -66.99882507324219, "loss": 3.4977, "rewards/accuracies": 0.0, "rewards/chosen": 1.209581732749939, "rewards/margins": -1.6094201803207397, "rewards/rejected": 2.8190019130706787, "step": 3701 }, { "epoch": 0.82, "learning_rate": 6.673370031008919e-06, "logits/chosen": -1.5156551599502563, "logits/rejected": -1.5593526363372803, "logps/chosen": -111.39972686767578, "logps/rejected": -95.63441467285156, "loss": 1.996, "rewards/accuracies": 0.0, "rewards/chosen": 5.9236016273498535, "rewards/margins": -2.327643871307373, "rewards/rejected": 8.251245498657227, "step": 3702 }, { "epoch": 0.82, "learning_rate": 6.671680951901988e-06, "logits/chosen": -1.279392957687378, "logits/rejected": -1.2947795391082764, "logps/chosen": -67.87118530273438, "logps/rejected": -51.53181457519531, "loss": 1.8161, "rewards/accuracies": 0.0, "rewards/chosen": 2.7483766078948975, "rewards/margins": -0.996208906173706, "rewards/rejected": 3.7445855140686035, "step": 3703 }, { "epoch": 0.82, "learning_rate": 6.669991657988072e-06, "logits/chosen": -1.5770764350891113, "logits/rejected": -1.4801597595214844, "logps/chosen": -46.98571014404297, "logps/rejected": -52.58872985839844, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 3.527608633041382, "rewards/margins": 0.4089202880859375, "rewards/rejected": 3.1186883449554443, "step": 3704 }, { "epoch": 0.82, "learning_rate": 6.668302149484242e-06, "logits/chosen": -1.7351738214492798, "logits/rejected": -1.7472450733184814, "logps/chosen": -57.744136810302734, "logps/rejected": -70.51664733886719, "loss": 1.5024, "rewards/accuracies": 0.0, "rewards/chosen": 2.2944705486297607, "rewards/margins": -2.7256457805633545, "rewards/rejected": 5.020116329193115, "step": 3705 }, { "epoch": 0.82, "learning_rate": 6.6666124266075986e-06, "logits/chosen": -1.0816540718078613, "logits/rejected": -1.0816540718078613, "logps/chosen": -3.606687545776367, "logps/rejected": -3.606687545776367, "loss": 0.3488, "rewards/accuracies": 0.0, "rewards/chosen": 1.0170389413833618, "rewards/margins": 0.0, "rewards/rejected": 1.0170389413833618, "step": 3706 }, { "epoch": 0.82, "learning_rate": 6.6649224895752625e-06, "logits/chosen": -1.6678539514541626, "logits/rejected": -1.5120643377304077, "logps/chosen": -122.52325439453125, "logps/rejected": -42.17120361328125, "loss": 0.5251, "rewards/accuracies": 1.0, "rewards/chosen": 9.501025199890137, "rewards/margins": 5.966734886169434, "rewards/rejected": 3.534290313720703, "step": 3707 }, { "epoch": 0.82, "learning_rate": 6.6632323386043896e-06, "logits/chosen": -1.5805994272232056, "logits/rejected": -1.590625286102295, "logps/chosen": -46.96345520019531, "logps/rejected": -38.580352783203125, "loss": 1.2785, "rewards/accuracies": 0.0, "rewards/chosen": 2.0014023780822754, "rewards/margins": -1.3309500217437744, "rewards/rejected": 3.33235239982605, "step": 3708 }, { "epoch": 0.82, "learning_rate": 6.66154197391216e-06, "logits/chosen": -1.2867275476455688, "logits/rejected": -1.2491083145141602, "logps/chosen": -111.49440002441406, "logps/rejected": -69.87901306152344, "loss": 0.5919, "rewards/accuracies": 1.0, "rewards/chosen": 3.5399062633514404, "rewards/margins": 0.5524024963378906, "rewards/rejected": 2.98750376701355, "step": 3709 }, { "epoch": 0.82, "learning_rate": 6.659851395715782e-06, "logits/chosen": -1.5217379331588745, "logits/rejected": -1.5143996477127075, "logps/chosen": -109.80810546875, "logps/rejected": -69.62590026855469, "loss": 1.47, "rewards/accuracies": 0.0, "rewards/chosen": 6.558082580566406, "rewards/margins": -0.2955293655395508, "rewards/rejected": 6.853611946105957, "step": 3710 }, { "epoch": 0.82, "learning_rate": 6.6581606042324875e-06, "logits/chosen": -1.782021403312683, "logits/rejected": -1.7920007705688477, "logps/chosen": -87.35791778564453, "logps/rejected": -58.87158203125, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 4.818221569061279, "rewards/margins": 2.5018625259399414, "rewards/rejected": 2.316359043121338, "step": 3711 }, { "epoch": 0.82, "learning_rate": 6.656469599679543e-06, "logits/chosen": -1.6568591594696045, "logits/rejected": -1.4099334478378296, "logps/chosen": -42.18572998046875, "logps/rejected": -32.08080291748047, "loss": 1.741, "rewards/accuracies": 1.0, "rewards/chosen": 2.4846572875976562, "rewards/margins": 0.15231013298034668, "rewards/rejected": 2.3323471546173096, "step": 3712 }, { "epoch": 0.82, "learning_rate": 6.654778382274237e-06, "logits/chosen": -1.9870827198028564, "logits/rejected": -1.968591332435608, "logps/chosen": -45.60835266113281, "logps/rejected": -20.494935989379883, "loss": 0.2819, "rewards/accuracies": 1.0, "rewards/chosen": 2.8482024669647217, "rewards/margins": 2.170797348022461, "rewards/rejected": 0.6774051785469055, "step": 3713 }, { "epoch": 0.82, "learning_rate": 6.653086952233887e-06, "logits/chosen": -1.459631323814392, "logits/rejected": -1.5144966840744019, "logps/chosen": -84.23485565185547, "logps/rejected": -101.79393005371094, "loss": 2.3221, "rewards/accuracies": 0.0, "rewards/chosen": 4.2680583000183105, "rewards/margins": -4.633438587188721, "rewards/rejected": 8.901496887207031, "step": 3714 }, { "epoch": 0.82, "learning_rate": 6.651395309775837e-06, "logits/chosen": -1.415709137916565, "logits/rejected": -1.587921380996704, "logps/chosen": -36.26760482788086, "logps/rejected": -94.75798797607422, "loss": 2.1175, "rewards/accuracies": 0.0, "rewards/chosen": 2.9265429973602295, "rewards/margins": -3.8415348529815674, "rewards/rejected": 6.768077850341797, "step": 3715 }, { "epoch": 0.82, "learning_rate": 6.6497034551174585e-06, "logits/chosen": -1.876475214958191, "logits/rejected": -1.8295977115631104, "logps/chosen": -57.180213928222656, "logps/rejected": -20.317419052124023, "loss": 0.2784, "rewards/accuracies": 1.0, "rewards/chosen": 2.443514347076416, "rewards/margins": 0.4442640542984009, "rewards/rejected": 1.9992502927780151, "step": 3716 }, { "epoch": 0.82, "learning_rate": 6.648011388476152e-06, "logits/chosen": -1.4844136238098145, "logits/rejected": -1.4979666471481323, "logps/chosen": -63.03192901611328, "logps/rejected": -40.836185455322266, "loss": 0.2692, "rewards/accuracies": 1.0, "rewards/chosen": 2.5410141944885254, "rewards/margins": 0.360736608505249, "rewards/rejected": 2.1802775859832764, "step": 3717 }, { "epoch": 0.82, "learning_rate": 6.646319110069346e-06, "logits/chosen": -1.574912428855896, "logits/rejected": -1.5614477396011353, "logps/chosen": -66.93302917480469, "logps/rejected": -66.31550598144531, "loss": 0.6869, "rewards/accuracies": 0.0, "rewards/chosen": 3.878769636154175, "rewards/margins": -1.0789835453033447, "rewards/rejected": 4.9577531814575195, "step": 3718 }, { "epoch": 0.82, "learning_rate": 6.64462662011449e-06, "logits/chosen": -1.4116160869598389, "logits/rejected": -1.3907886743545532, "logps/chosen": -74.76643371582031, "logps/rejected": -74.58500671386719, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 5.783133029937744, "rewards/margins": 0.7780790328979492, "rewards/rejected": 5.005053997039795, "step": 3719 }, { "epoch": 0.82, "learning_rate": 6.642933918829067e-06, "logits/chosen": -1.8647253513336182, "logits/rejected": -1.8646384477615356, "logps/chosen": -63.423362731933594, "logps/rejected": -67.19599151611328, "loss": 1.0537, "rewards/accuracies": 0.0, "rewards/chosen": 5.0245280265808105, "rewards/margins": -1.135728359222412, "rewards/rejected": 6.160256385803223, "step": 3720 }, { "epoch": 0.82, "learning_rate": 6.641241006430586e-06, "logits/chosen": -1.714017391204834, "logits/rejected": -1.6636393070220947, "logps/chosen": -83.44183349609375, "logps/rejected": -64.17529296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 9.067278861999512, "rewards/margins": 6.590965270996094, "rewards/rejected": 2.476313829421997, "step": 3721 }, { "epoch": 0.82, "learning_rate": 6.639547883136581e-06, "logits/chosen": -1.4657022953033447, "logits/rejected": -1.3758759498596191, "logps/chosen": -160.58229064941406, "logps/rejected": -85.32164001464844, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 7.031134128570557, "rewards/margins": 3.3699357509613037, "rewards/rejected": 3.661198377609253, "step": 3722 }, { "epoch": 0.82, "learning_rate": 6.637854549164614e-06, "logits/chosen": -1.862083911895752, "logits/rejected": -1.8479102849960327, "logps/chosen": -46.06120681762695, "logps/rejected": -80.06761169433594, "loss": 0.2753, "rewards/accuracies": 1.0, "rewards/chosen": 3.5433452129364014, "rewards/margins": 0.3865840435028076, "rewards/rejected": 3.1567611694335938, "step": 3723 }, { "epoch": 0.82, "learning_rate": 6.636161004732274e-06, "logits/chosen": -1.2949777841567993, "logits/rejected": -1.3547707796096802, "logps/chosen": -24.337675094604492, "logps/rejected": -100.63478088378906, "loss": 1.6218, "rewards/accuracies": 0.0, "rewards/chosen": 2.582280158996582, "rewards/margins": -2.76768159866333, "rewards/rejected": 5.349961757659912, "step": 3724 }, { "epoch": 0.82, "learning_rate": 6.634467250057179e-06, "logits/chosen": -1.6245791912078857, "logits/rejected": -1.590955376625061, "logps/chosen": -50.27980041503906, "logps/rejected": -57.54869842529297, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 4.036398410797119, "rewards/margins": 2.294957160949707, "rewards/rejected": 1.7414413690567017, "step": 3725 }, { "epoch": 0.82, "learning_rate": 6.63277328535697e-06, "logits/chosen": -1.4775669574737549, "logits/rejected": -1.4030739068984985, "logps/chosen": -52.500389099121094, "logps/rejected": -97.87779998779297, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 2.666586399078369, "rewards/margins": 0.26778721809387207, "rewards/rejected": 2.398799180984497, "step": 3726 }, { "epoch": 0.82, "learning_rate": 6.631079110849322e-06, "logits/chosen": -1.298406958580017, "logits/rejected": -1.2728114128112793, "logps/chosen": -39.79609680175781, "logps/rejected": -50.15137481689453, "loss": 0.5216, "rewards/accuracies": 1.0, "rewards/chosen": 1.746419906616211, "rewards/margins": 0.719098687171936, "rewards/rejected": 1.027321219444275, "step": 3727 }, { "epoch": 0.83, "learning_rate": 6.6293847267519275e-06, "logits/chosen": -1.6510939598083496, "logits/rejected": -1.6306663751602173, "logps/chosen": -42.518470764160156, "logps/rejected": -35.07581329345703, "loss": 2.0551, "rewards/accuracies": 0.0, "rewards/chosen": 2.006446599960327, "rewards/margins": -2.5033938884735107, "rewards/rejected": 4.509840488433838, "step": 3728 }, { "epoch": 0.83, "learning_rate": 6.6276901332825115e-06, "logits/chosen": -1.5122376680374146, "logits/rejected": -1.4835116863250732, "logps/chosen": -96.49565887451172, "logps/rejected": -119.59721374511719, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": 6.919818878173828, "rewards/margins": 2.005460262298584, "rewards/rejected": 4.914358615875244, "step": 3729 }, { "epoch": 0.83, "learning_rate": 6.625995330658828e-06, "logits/chosen": -1.5418301820755005, "logits/rejected": -1.4725455045700073, "logps/chosen": -201.52520751953125, "logps/rejected": -98.69369506835938, "loss": 0.3055, "rewards/accuracies": 1.0, "rewards/chosen": 6.3411865234375, "rewards/margins": 0.17183828353881836, "rewards/rejected": 6.169348239898682, "step": 3730 }, { "epoch": 0.83, "learning_rate": 6.6243003190986525e-06, "logits/chosen": -1.4731146097183228, "logits/rejected": -1.4731146097183228, "logps/chosen": -37.086029052734375, "logps/rejected": -37.086029052734375, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 2.7527923583984375, "rewards/margins": 0.0, "rewards/rejected": 2.7527923583984375, "step": 3731 }, { "epoch": 0.83, "learning_rate": 6.622605098819791e-06, "logits/chosen": -1.5658549070358276, "logits/rejected": -1.4505635499954224, "logps/chosen": -136.97421264648438, "logps/rejected": -45.85251998901367, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 6.512490749359131, "rewards/margins": 3.690606117248535, "rewards/rejected": 2.8218846321105957, "step": 3732 }, { "epoch": 0.83, "learning_rate": 6.620909670040074e-06, "logits/chosen": -1.8180792331695557, "logits/rejected": -1.7231500148773193, "logps/chosen": -32.22995376586914, "logps/rejected": -8.163932800292969, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": 4.388972282409668, "rewards/margins": 2.4557371139526367, "rewards/rejected": 1.9332351684570312, "step": 3733 }, { "epoch": 0.83, "learning_rate": 6.619214032977361e-06, "logits/chosen": -1.7469953298568726, "logits/rejected": -1.6084158420562744, "logps/chosen": -129.85183715820312, "logps/rejected": -25.417951583862305, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 5.667481899261475, "rewards/margins": 4.139155387878418, "rewards/rejected": 1.5283266305923462, "step": 3734 }, { "epoch": 0.83, "learning_rate": 6.617518187849539e-06, "logits/chosen": -1.7610286474227905, "logits/rejected": -1.8114598989486694, "logps/chosen": -57.8201904296875, "logps/rejected": -108.27119445800781, "loss": 1.2556, "rewards/accuracies": 0.0, "rewards/chosen": 4.492240905761719, "rewards/margins": -2.2373046875, "rewards/rejected": 6.729545593261719, "step": 3735 }, { "epoch": 0.83, "learning_rate": 6.615822134874517e-06, "logits/chosen": -1.8600424528121948, "logits/rejected": -1.622531771659851, "logps/chosen": -94.14089965820312, "logps/rejected": -23.470340728759766, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 5.816958904266357, "rewards/margins": 5.880756855010986, "rewards/rejected": -0.06379814445972443, "step": 3736 }, { "epoch": 0.83, "learning_rate": 6.614125874270235e-06, "logits/chosen": -1.4486875534057617, "logits/rejected": -1.522536277770996, "logps/chosen": -48.76986312866211, "logps/rejected": -89.4686279296875, "loss": 0.395, "rewards/accuracies": 0.0, "rewards/chosen": 4.038341999053955, "rewards/margins": -0.17592525482177734, "rewards/rejected": 4.214267253875732, "step": 3737 }, { "epoch": 0.83, "learning_rate": 6.612429406254659e-06, "logits/chosen": -1.6469660997390747, "logits/rejected": -1.6257588863372803, "logps/chosen": -101.10476684570312, "logps/rejected": -31.61172103881836, "loss": 0.4462, "rewards/accuracies": 1.0, "rewards/chosen": 4.433690071105957, "rewards/margins": 0.4894101619720459, "rewards/rejected": 3.944279909133911, "step": 3738 }, { "epoch": 0.83, "learning_rate": 6.610732731045779e-06, "logits/chosen": -1.6088857650756836, "logits/rejected": -1.5780447721481323, "logps/chosen": -91.61396789550781, "logps/rejected": -180.3712921142578, "loss": 0.7143, "rewards/accuracies": 0.0, "rewards/chosen": 7.1747636795043945, "rewards/margins": -1.1520261764526367, "rewards/rejected": 8.326789855957031, "step": 3739 }, { "epoch": 0.83, "learning_rate": 6.609035848861616e-06, "logits/chosen": -1.594772458076477, "logits/rejected": -1.5110129117965698, "logps/chosen": -44.3434944152832, "logps/rejected": -39.14865493774414, "loss": 0.5245, "rewards/accuracies": 0.0, "rewards/chosen": 2.7891323566436768, "rewards/margins": -0.5599837303161621, "rewards/rejected": 3.349116086959839, "step": 3740 }, { "epoch": 0.83, "learning_rate": 6.607338759920214e-06, "logits/chosen": -1.471877098083496, "logits/rejected": -1.5523613691329956, "logps/chosen": -33.090301513671875, "logps/rejected": -89.00343322753906, "loss": 1.4417, "rewards/accuracies": 0.0, "rewards/chosen": 1.7259712219238281, "rewards/margins": -2.8094096183776855, "rewards/rejected": 4.535380840301514, "step": 3741 }, { "epoch": 0.83, "learning_rate": 6.605641464439647e-06, "logits/chosen": -1.5378262996673584, "logits/rejected": -1.4377456903457642, "logps/chosen": -156.24502563476562, "logps/rejected": -12.15239429473877, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": 5.3120436668396, "rewards/margins": 4.908335208892822, "rewards/rejected": 0.4037083685398102, "step": 3742 }, { "epoch": 0.83, "learning_rate": 6.603943962638011e-06, "logits/chosen": -1.8389496803283691, "logits/rejected": -1.7643271684646606, "logps/chosen": -106.79890441894531, "logps/rejected": -50.96330642700195, "loss": 0.652, "rewards/accuracies": 1.0, "rewards/chosen": 5.6423020362854, "rewards/margins": 1.3524885177612305, "rewards/rejected": 4.28981351852417, "step": 3743 }, { "epoch": 0.83, "learning_rate": 6.602246254733431e-06, "logits/chosen": -1.413193941116333, "logits/rejected": -1.2984564304351807, "logps/chosen": -51.765525817871094, "logps/rejected": -30.30659294128418, "loss": 1.7916, "rewards/accuracies": 0.0, "rewards/chosen": 2.803138017654419, "rewards/margins": -1.0945899486541748, "rewards/rejected": 3.8977279663085938, "step": 3744 }, { "epoch": 0.83, "learning_rate": 6.60054834094406e-06, "logits/chosen": -1.6870243549346924, "logits/rejected": -1.7195215225219727, "logps/chosen": -82.2253646850586, "logps/rejected": -132.3787841796875, "loss": 0.48, "rewards/accuracies": 1.0, "rewards/chosen": 7.4354472160339355, "rewards/margins": 0.1743755340576172, "rewards/rejected": 7.261071681976318, "step": 3745 }, { "epoch": 0.83, "learning_rate": 6.598850221488073e-06, "logits/chosen": -2.035421371459961, "logits/rejected": -1.9207637310028076, "logps/chosen": -156.06982421875, "logps/rejected": -28.718082427978516, "loss": 1.1213, "rewards/accuracies": 1.0, "rewards/chosen": 6.401383876800537, "rewards/margins": 4.59800910949707, "rewards/rejected": 1.8033745288848877, "step": 3746 }, { "epoch": 0.83, "learning_rate": 6.597151896583677e-06, "logits/chosen": -1.5080091953277588, "logits/rejected": -1.5092575550079346, "logps/chosen": -61.184242248535156, "logps/rejected": -64.36994171142578, "loss": 1.2739, "rewards/accuracies": 0.0, "rewards/chosen": 2.4948227405548096, "rewards/margins": -2.4498274326324463, "rewards/rejected": 4.944650173187256, "step": 3747 }, { "epoch": 0.83, "learning_rate": 6.595453366449101e-06, "logits/chosen": -1.4853023290634155, "logits/rejected": -1.6311472654342651, "logps/chosen": -46.6015625, "logps/rejected": -26.929018020629883, "loss": 0.2446, "rewards/accuracies": 1.0, "rewards/chosen": 3.1930344104766846, "rewards/margins": 0.6355772018432617, "rewards/rejected": 2.557457208633423, "step": 3748 }, { "epoch": 0.83, "learning_rate": 6.593754631302603e-06, "logits/chosen": -1.5485389232635498, "logits/rejected": -1.5941587686538696, "logps/chosen": -41.22688674926758, "logps/rejected": -55.321136474609375, "loss": 1.9831, "rewards/accuracies": 0.0, "rewards/chosen": 1.8331329822540283, "rewards/margins": -2.903660535812378, "rewards/rejected": 4.736793518066406, "step": 3749 }, { "epoch": 0.83, "learning_rate": 6.592055691362467e-06, "logits/chosen": -1.6933186054229736, "logits/rejected": -1.573819637298584, "logps/chosen": -44.314632415771484, "logps/rejected": -11.553329467773438, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 4.305251121520996, "rewards/margins": 3.348025321960449, "rewards/rejected": 0.9572257995605469, "step": 3750 }, { "epoch": 0.83, "learning_rate": 6.590356546847002e-06, "logits/chosen": -1.7776480913162231, "logits/rejected": -1.6404109001159668, "logps/chosen": -115.63959503173828, "logps/rejected": -52.24967575073242, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 4.841797828674316, "rewards/margins": 2.5453298091888428, "rewards/rejected": 2.2964680194854736, "step": 3751 }, { "epoch": 0.83, "learning_rate": 6.588657197974544e-06, "logits/chosen": -1.8805242776870728, "logits/rejected": -1.7656629085540771, "logps/chosen": -81.44532775878906, "logps/rejected": -38.570465087890625, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 4.8997802734375, "rewards/margins": 0.24656915664672852, "rewards/rejected": 4.6532111167907715, "step": 3752 }, { "epoch": 0.83, "learning_rate": 6.586957644963454e-06, "logits/chosen": -1.5659797191619873, "logits/rejected": -1.5721455812454224, "logps/chosen": -73.99716186523438, "logps/rejected": -67.06648254394531, "loss": 1.1147, "rewards/accuracies": 0.0, "rewards/chosen": 2.1095893383026123, "rewards/margins": -2.106548547744751, "rewards/rejected": 4.216137886047363, "step": 3753 }, { "epoch": 0.83, "learning_rate": 6.585257888032123e-06, "logits/chosen": -1.5338616371154785, "logits/rejected": -1.426485300064087, "logps/chosen": -82.67147064208984, "logps/rejected": -42.5308723449707, "loss": 0.9328, "rewards/accuracies": 1.0, "rewards/chosen": 5.790316104888916, "rewards/margins": 2.4971187114715576, "rewards/rejected": 3.2931973934173584, "step": 3754 }, { "epoch": 0.83, "learning_rate": 6.583557927398962e-06, "logits/chosen": -1.382710576057434, "logits/rejected": -1.350295066833496, "logps/chosen": -75.5118408203125, "logps/rejected": -111.31688690185547, "loss": 0.5084, "rewards/accuracies": 1.0, "rewards/chosen": 2.739732503890991, "rewards/margins": 2.0139520168304443, "rewards/rejected": 0.7257804870605469, "step": 3755 }, { "epoch": 0.83, "learning_rate": 6.581857763282416e-06, "logits/chosen": -1.5246020555496216, "logits/rejected": -1.5019506216049194, "logps/chosen": -92.31231689453125, "logps/rejected": -68.08284759521484, "loss": 1.5333, "rewards/accuracies": 0.0, "rewards/chosen": 5.558362007141113, "rewards/margins": -3.0152111053466797, "rewards/rejected": 8.573573112487793, "step": 3756 }, { "epoch": 0.83, "learning_rate": 6.580157395900949e-06, "logits/chosen": -1.3731215000152588, "logits/rejected": -1.3223202228546143, "logps/chosen": -25.315322875976562, "logps/rejected": -6.752137184143066, "loss": 0.3448, "rewards/accuracies": 1.0, "rewards/chosen": 2.265676259994507, "rewards/margins": 0.5939278602600098, "rewards/rejected": 1.671748399734497, "step": 3757 }, { "epoch": 0.83, "learning_rate": 6.578456825473055e-06, "logits/chosen": -1.6463444232940674, "logits/rejected": -1.6379979848861694, "logps/chosen": -62.32428741455078, "logps/rejected": -57.83921432495117, "loss": 0.2093, "rewards/accuracies": 1.0, "rewards/chosen": 3.8993966579437256, "rewards/margins": 0.6733455657958984, "rewards/rejected": 3.226051092147827, "step": 3758 }, { "epoch": 0.83, "learning_rate": 6.5767560522172535e-06, "logits/chosen": -1.7431058883666992, "logits/rejected": -1.690545916557312, "logps/chosen": -93.13543701171875, "logps/rejected": -83.90153503417969, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": 7.8929595947265625, "rewards/margins": 3.581760883331299, "rewards/rejected": 4.311198711395264, "step": 3759 }, { "epoch": 0.83, "learning_rate": 6.575055076352091e-06, "logits/chosen": -1.6620697975158691, "logits/rejected": -1.6401492357254028, "logps/chosen": -60.17082595825195, "logps/rejected": -41.22169494628906, "loss": 0.4121, "rewards/accuracies": 0.0, "rewards/chosen": 2.732194185256958, "rewards/margins": -0.23141050338745117, "rewards/rejected": 2.963604688644409, "step": 3760 }, { "epoch": 0.83, "learning_rate": 6.573353898096137e-06, "logits/chosen": -1.6973249912261963, "logits/rejected": -1.6973249912261963, "logps/chosen": -45.17121887207031, "logps/rejected": -45.17121887207031, "loss": 0.3753, "rewards/accuracies": 0.0, "rewards/chosen": 1.8205994367599487, "rewards/margins": 0.0, "rewards/rejected": 1.8205994367599487, "step": 3761 }, { "epoch": 0.83, "learning_rate": 6.571652517667989e-06, "logits/chosen": -1.7296504974365234, "logits/rejected": -1.7542223930358887, "logps/chosen": -50.850162506103516, "logps/rejected": -78.57357788085938, "loss": 3.2728, "rewards/accuracies": 0.0, "rewards/chosen": 1.3677250146865845, "rewards/margins": -4.68595552444458, "rewards/rejected": 6.053680419921875, "step": 3762 }, { "epoch": 0.83, "learning_rate": 6.569950935286271e-06, "logits/chosen": -1.5263818502426147, "logits/rejected": -1.5032700300216675, "logps/chosen": -58.888465881347656, "logps/rejected": -38.06431198120117, "loss": 0.4525, "rewards/accuracies": 0.0, "rewards/chosen": 2.1478004455566406, "rewards/margins": -0.298905611038208, "rewards/rejected": 2.4467060565948486, "step": 3763 }, { "epoch": 0.83, "learning_rate": 6.568249151169632e-06, "logits/chosen": -1.6009385585784912, "logits/rejected": -1.595913290977478, "logps/chosen": -109.76663970947266, "logps/rejected": -150.99630737304688, "loss": 2.3815, "rewards/accuracies": 0.0, "rewards/chosen": 6.237606048583984, "rewards/margins": -4.197727203369141, "rewards/rejected": 10.435333251953125, "step": 3764 }, { "epoch": 0.83, "learning_rate": 6.566547165536747e-06, "logits/chosen": -1.8102763891220093, "logits/rejected": -1.7352430820465088, "logps/chosen": -97.65007019042969, "logps/rejected": -72.22572326660156, "loss": 1.1749, "rewards/accuracies": 0.0, "rewards/chosen": 1.716478705406189, "rewards/margins": -1.8789254426956177, "rewards/rejected": 3.5954041481018066, "step": 3765 }, { "epoch": 0.83, "learning_rate": 6.564844978606316e-06, "logits/chosen": -1.4413444995880127, "logits/rejected": -1.3613245487213135, "logps/chosen": -99.75186157226562, "logps/rejected": -65.13078308105469, "loss": 0.5596, "rewards/accuracies": 0.0, "rewards/chosen": 5.088452339172363, "rewards/margins": -0.4783658981323242, "rewards/rejected": 5.5668182373046875, "step": 3766 }, { "epoch": 0.83, "learning_rate": 6.563142590597067e-06, "logits/chosen": -1.4543653726577759, "logits/rejected": -1.4839248657226562, "logps/chosen": -48.2364501953125, "logps/rejected": -97.68934631347656, "loss": 0.5361, "rewards/accuracies": 0.0, "rewards/chosen": 3.4001145362854004, "rewards/margins": -0.5150024890899658, "rewards/rejected": 3.915117025375366, "step": 3767 }, { "epoch": 0.83, "learning_rate": 6.561440001727755e-06, "logits/chosen": -1.6431896686553955, "logits/rejected": -1.6063815355300903, "logps/chosen": -40.55340576171875, "logps/rejected": -56.20936965942383, "loss": 1.2914, "rewards/accuracies": 0.0, "rewards/chosen": 2.103595018386841, "rewards/margins": -0.8049435615539551, "rewards/rejected": 2.908538579940796, "step": 3768 }, { "epoch": 0.83, "learning_rate": 6.559737212217155e-06, "logits/chosen": -1.5932239294052124, "logits/rejected": -1.4866727590560913, "logps/chosen": -33.44709396362305, "logps/rejected": -48.251705169677734, "loss": 0.4165, "rewards/accuracies": 0.0, "rewards/chosen": 3.847383499145508, "rewards/margins": -0.07974410057067871, "rewards/rejected": 3.9271275997161865, "step": 3769 }, { "epoch": 0.83, "learning_rate": 6.558034222284072e-06, "logits/chosen": -1.6329998970031738, "logits/rejected": -1.51571786403656, "logps/chosen": -57.679046630859375, "logps/rejected": -46.20322036743164, "loss": 0.8739, "rewards/accuracies": 0.0, "rewards/chosen": 2.079502820968628, "rewards/margins": -1.5563023090362549, "rewards/rejected": 3.635805130004883, "step": 3770 }, { "epoch": 0.83, "learning_rate": 6.556331032147337e-06, "logits/chosen": -1.764376163482666, "logits/rejected": -1.7846592664718628, "logps/chosen": -49.48861312866211, "logps/rejected": -82.76014709472656, "loss": 1.8795, "rewards/accuracies": 0.0, "rewards/chosen": 3.727635622024536, "rewards/margins": -3.6191065311431885, "rewards/rejected": 7.346742153167725, "step": 3771 }, { "epoch": 0.83, "learning_rate": 6.554627642025807e-06, "logits/chosen": -1.2312301397323608, "logits/rejected": -1.179246187210083, "logps/chosen": -31.24907875061035, "logps/rejected": -45.197509765625, "loss": 1.3117, "rewards/accuracies": 0.0, "rewards/chosen": 1.6657098531723022, "rewards/margins": -2.5473837852478027, "rewards/rejected": 4.2130937576293945, "step": 3772 }, { "epoch": 0.84, "learning_rate": 6.552924052138362e-06, "logits/chosen": -2.026491403579712, "logits/rejected": -2.0159730911254883, "logps/chosen": -89.53631591796875, "logps/rejected": -122.08537292480469, "loss": 1.711, "rewards/accuracies": 0.0, "rewards/chosen": 6.991799831390381, "rewards/margins": -3.3864216804504395, "rewards/rejected": 10.37822151184082, "step": 3773 }, { "epoch": 0.84, "learning_rate": 6.551220262703909e-06, "logits/chosen": -1.6679787635803223, "logits/rejected": -1.6661627292633057, "logps/chosen": -108.45687103271484, "logps/rejected": -33.72906494140625, "loss": 0.718, "rewards/accuracies": 1.0, "rewards/chosen": 4.168416500091553, "rewards/margins": 1.5013816356658936, "rewards/rejected": 2.667034864425659, "step": 3774 }, { "epoch": 0.84, "learning_rate": 6.549516273941381e-06, "logits/chosen": -1.6141911745071411, "logits/rejected": -1.5593228340148926, "logps/chosen": -35.1690673828125, "logps/rejected": -23.732074737548828, "loss": 0.1288, "rewards/accuracies": 1.0, "rewards/chosen": 2.600351095199585, "rewards/margins": 1.228490948677063, "rewards/rejected": 1.371860146522522, "step": 3775 }, { "epoch": 0.84, "learning_rate": 6.547812086069736e-06, "logits/chosen": -1.8265986442565918, "logits/rejected": -1.8328286409378052, "logps/chosen": -25.934345245361328, "logps/rejected": -125.44935607910156, "loss": 4.3059, "rewards/accuracies": 0.0, "rewards/chosen": 2.456012725830078, "rewards/margins": -4.818666934967041, "rewards/rejected": 7.274679660797119, "step": 3776 }, { "epoch": 0.84, "learning_rate": 6.546107699307961e-06, "logits/chosen": -1.8870073556900024, "logits/rejected": -1.8231096267700195, "logps/chosen": -76.5265884399414, "logps/rejected": -48.47666549682617, "loss": 0.397, "rewards/accuracies": 0.0, "rewards/chosen": 2.4239342212677, "rewards/margins": -0.12630581855773926, "rewards/rejected": 2.5502400398254395, "step": 3777 }, { "epoch": 0.84, "learning_rate": 6.544403113875062e-06, "logits/chosen": -1.8138471841812134, "logits/rejected": -1.8235772848129272, "logps/chosen": -118.5780258178711, "logps/rejected": -101.2314453125, "loss": 1.1072, "rewards/accuracies": 0.0, "rewards/chosen": 4.846781253814697, "rewards/margins": -1.9056310653686523, "rewards/rejected": 6.75241231918335, "step": 3778 }, { "epoch": 0.84, "learning_rate": 6.542698329990075e-06, "logits/chosen": -1.2914955615997314, "logits/rejected": -1.1421250104904175, "logps/chosen": -41.158287048339844, "logps/rejected": -10.976557731628418, "loss": 1.4648, "rewards/accuracies": 1.0, "rewards/chosen": 2.8039650917053223, "rewards/margins": 1.4072694778442383, "rewards/rejected": 1.396695613861084, "step": 3779 }, { "epoch": 0.84, "learning_rate": 6.540993347872061e-06, "logits/chosen": -1.452895164489746, "logits/rejected": -1.4421874284744263, "logps/chosen": -64.26165008544922, "logps/rejected": -51.11177444458008, "loss": 0.4743, "rewards/accuracies": 0.0, "rewards/chosen": 2.3623757362365723, "rewards/margins": -0.3807528018951416, "rewards/rejected": 2.743128538131714, "step": 3780 }, { "epoch": 0.84, "learning_rate": 6.539288167740108e-06, "logits/chosen": -1.6795583963394165, "logits/rejected": -1.6294111013412476, "logps/chosen": -46.67752456665039, "logps/rejected": -83.05087280273438, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 4.757590293884277, "rewards/margins": 2.23374080657959, "rewards/rejected": 2.5238494873046875, "step": 3781 }, { "epoch": 0.84, "learning_rate": 6.537582789813324e-06, "logits/chosen": -1.4786189794540405, "logits/rejected": -1.3589634895324707, "logps/chosen": -132.05404663085938, "logps/rejected": -84.6426010131836, "loss": 0.1991, "rewards/accuracies": 1.0, "rewards/chosen": 6.567474365234375, "rewards/margins": 0.9215035438537598, "rewards/rejected": 5.645970821380615, "step": 3782 }, { "epoch": 0.84, "learning_rate": 6.5358772143108485e-06, "logits/chosen": -1.7539490461349487, "logits/rejected": -1.7637436389923096, "logps/chosen": -78.48136901855469, "logps/rejected": -89.0823974609375, "loss": 0.1888, "rewards/accuracies": 1.0, "rewards/chosen": 4.58058500289917, "rewards/margins": 1.3809740543365479, "rewards/rejected": 3.199610948562622, "step": 3783 }, { "epoch": 0.84, "learning_rate": 6.534171441451843e-06, "logits/chosen": -1.6576911211013794, "logits/rejected": -1.679687261581421, "logps/chosen": -73.00233459472656, "logps/rejected": -95.79228210449219, "loss": 2.6229, "rewards/accuracies": 0.0, "rewards/chosen": 2.8519744873046875, "rewards/margins": -3.588618755340576, "rewards/rejected": 6.440593242645264, "step": 3784 }, { "epoch": 0.84, "learning_rate": 6.532465471455496e-06, "logits/chosen": -1.7599658966064453, "logits/rejected": -1.7599658966064453, "logps/chosen": -58.48582458496094, "logps/rejected": -58.48582458496094, "loss": 0.7338, "rewards/accuracies": 0.0, "rewards/chosen": 2.7055861949920654, "rewards/margins": 0.0, "rewards/rejected": 2.7055861949920654, "step": 3785 }, { "epoch": 0.84, "learning_rate": 6.53075930454102e-06, "logits/chosen": -1.380201816558838, "logits/rejected": -1.4094042778015137, "logps/chosen": -20.880413055419922, "logps/rejected": -35.94860076904297, "loss": 0.8344, "rewards/accuracies": 0.0, "rewards/chosen": 1.7796276807785034, "rewards/margins": -0.5025280714035034, "rewards/rejected": 2.282155752182007, "step": 3786 }, { "epoch": 0.84, "learning_rate": 6.529052940927652e-06, "logits/chosen": -1.693167805671692, "logits/rejected": -1.6717385053634644, "logps/chosen": -81.73956298828125, "logps/rejected": -74.26063537597656, "loss": 0.2696, "rewards/accuracies": 1.0, "rewards/chosen": 3.1175994873046875, "rewards/margins": 0.3918280601501465, "rewards/rejected": 2.725771427154541, "step": 3787 }, { "epoch": 0.84, "learning_rate": 6.527346380834657e-06, "logits/chosen": -1.7308778762817383, "logits/rejected": -1.6973754167556763, "logps/chosen": -55.634193420410156, "logps/rejected": -11.844486236572266, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": 5.482418060302734, "rewards/margins": 4.394396781921387, "rewards/rejected": 1.088021159172058, "step": 3788 }, { "epoch": 0.84, "learning_rate": 6.5256396244813235e-06, "logits/chosen": -1.84718656539917, "logits/rejected": -1.7883975505828857, "logps/chosen": -46.988731384277344, "logps/rejected": -15.765873908996582, "loss": 0.9558, "rewards/accuracies": 1.0, "rewards/chosen": 4.799112796783447, "rewards/margins": 2.794179916381836, "rewards/rejected": 2.0049328804016113, "step": 3789 }, { "epoch": 0.84, "learning_rate": 6.523932672086967e-06, "logits/chosen": -2.003648042678833, "logits/rejected": -1.9783424139022827, "logps/chosen": -74.03453826904297, "logps/rejected": -37.38035583496094, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 2.693035125732422, "rewards/margins": 1.5293303728103638, "rewards/rejected": 1.163704752922058, "step": 3790 }, { "epoch": 0.84, "learning_rate": 6.522225523870924e-06, "logits/chosen": -1.6165618896484375, "logits/rejected": -1.6165618896484375, "logps/chosen": -49.56588363647461, "logps/rejected": -49.56588363647461, "loss": 0.375, "rewards/accuracies": 0.0, "rewards/chosen": 1.58063542842865, "rewards/margins": 0.0, "rewards/rejected": 1.58063542842865, "step": 3791 }, { "epoch": 0.84, "learning_rate": 6.520518180052562e-06, "logits/chosen": -1.8203637599945068, "logits/rejected": -1.8435745239257812, "logps/chosen": -38.839874267578125, "logps/rejected": -54.98418426513672, "loss": 1.1102, "rewards/accuracies": 0.0, "rewards/chosen": 3.5616424083709717, "rewards/margins": -0.023784637451171875, "rewards/rejected": 3.5854270458221436, "step": 3792 }, { "epoch": 0.84, "learning_rate": 6.518810640851269e-06, "logits/chosen": -1.5614157915115356, "logits/rejected": -1.5911189317703247, "logps/chosen": -18.623903274536133, "logps/rejected": -38.20027542114258, "loss": 0.8433, "rewards/accuracies": 0.0, "rewards/chosen": 1.0134273767471313, "rewards/margins": -1.3621395826339722, "rewards/rejected": 2.3755669593811035, "step": 3793 }, { "epoch": 0.84, "learning_rate": 6.517102906486459e-06, "logits/chosen": -1.3591135740280151, "logits/rejected": -1.3591135740280151, "logps/chosen": -27.477157592773438, "logps/rejected": -27.477157592773438, "loss": 0.4036, "rewards/accuracies": 0.0, "rewards/chosen": 3.3327274322509766, "rewards/margins": 0.0, "rewards/rejected": 3.3327274322509766, "step": 3794 }, { "epoch": 0.84, "learning_rate": 6.515394977177574e-06, "logits/chosen": -1.6649924516677856, "logits/rejected": -1.6337426900863647, "logps/chosen": -78.55818176269531, "logps/rejected": -107.63777160644531, "loss": 0.6052, "rewards/accuracies": 0.0, "rewards/chosen": 3.329113721847534, "rewards/margins": -0.07647109031677246, "rewards/rejected": 3.4055848121643066, "step": 3795 }, { "epoch": 0.84, "learning_rate": 6.513686853144076e-06, "logits/chosen": -1.7081485986709595, "logits/rejected": -1.6297495365142822, "logps/chosen": -105.34857177734375, "logps/rejected": -47.95332336425781, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 7.715297222137451, "rewards/margins": 4.073663711547852, "rewards/rejected": 3.6416337490081787, "step": 3796 }, { "epoch": 0.84, "learning_rate": 6.511978534605456e-06, "logits/chosen": -1.5715758800506592, "logits/rejected": -1.5775291919708252, "logps/chosen": -38.525146484375, "logps/rejected": -48.12981414794922, "loss": 2.9355, "rewards/accuracies": 0.0, "rewards/chosen": 2.5522377490997314, "rewards/margins": -0.25380706787109375, "rewards/rejected": 2.806044816970825, "step": 3797 }, { "epoch": 0.84, "learning_rate": 6.510270021781228e-06, "logits/chosen": -1.5542654991149902, "logits/rejected": -1.5749964714050293, "logps/chosen": -109.7231216430664, "logps/rejected": -50.67914581298828, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 9.051207542419434, "rewards/margins": 2.2564616203308105, "rewards/rejected": 6.794745922088623, "step": 3798 }, { "epoch": 0.84, "learning_rate": 6.5085613148909345e-06, "logits/chosen": -1.4737366437911987, "logits/rejected": -1.4102275371551514, "logps/chosen": -41.79314422607422, "logps/rejected": -48.51941680908203, "loss": 0.6705, "rewards/accuracies": 0.0, "rewards/chosen": 1.3353302478790283, "rewards/margins": -0.9898827075958252, "rewards/rejected": 2.3252129554748535, "step": 3799 }, { "epoch": 0.84, "learning_rate": 6.506852414154138e-06, "logits/chosen": -1.6075935363769531, "logits/rejected": -1.5128151178359985, "logps/chosen": -47.804447174072266, "logps/rejected": -14.840721130371094, "loss": 0.1693, "rewards/accuracies": 1.0, "rewards/chosen": 1.8349239826202393, "rewards/margins": 1.0960012674331665, "rewards/rejected": 0.7389227151870728, "step": 3800 }, { "epoch": 0.84, "learning_rate": 6.505143319790428e-06, "logits/chosen": -1.172525405883789, "logits/rejected": -1.1576329469680786, "logps/chosen": -49.28272247314453, "logps/rejected": -45.437232971191406, "loss": 0.4347, "rewards/accuracies": 0.0, "rewards/chosen": 3.041175127029419, "rewards/margins": -0.16808772087097168, "rewards/rejected": 3.2092628479003906, "step": 3801 }, { "epoch": 0.84, "learning_rate": 6.503434032019421e-06, "logits/chosen": -1.6324244737625122, "logits/rejected": -1.6324244737625122, "logps/chosen": -23.87477684020996, "logps/rejected": -23.87477684020996, "loss": 0.8817, "rewards/accuracies": 0.0, "rewards/chosen": 2.6443357467651367, "rewards/margins": 0.0, "rewards/rejected": 2.6443357467651367, "step": 3802 }, { "epoch": 0.84, "learning_rate": 6.501724551060755e-06, "logits/chosen": -1.5559786558151245, "logits/rejected": -1.5424062013626099, "logps/chosen": -86.00212097167969, "logps/rejected": -57.161746978759766, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 6.9385576248168945, "rewards/margins": 2.1024656295776367, "rewards/rejected": 4.836091995239258, "step": 3803 }, { "epoch": 0.84, "learning_rate": 6.500014877134094e-06, "logits/chosen": -1.76283597946167, "logits/rejected": -1.6766676902770996, "logps/chosen": -100.37419128417969, "logps/rejected": -71.21084594726562, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 8.001347541809082, "rewards/margins": 3.7602334022521973, "rewards/rejected": 4.241114139556885, "step": 3804 }, { "epoch": 0.84, "learning_rate": 6.498305010459127e-06, "logits/chosen": -1.5412856340408325, "logits/rejected": -1.5386825799942017, "logps/chosen": -35.58928680419922, "logps/rejected": -53.67109680175781, "loss": 0.7735, "rewards/accuracies": 1.0, "rewards/chosen": 2.5622193813323975, "rewards/margins": 0.4266176223754883, "rewards/rejected": 2.135601758956909, "step": 3805 }, { "epoch": 0.84, "learning_rate": 6.496594951255568e-06, "logits/chosen": -1.9417463541030884, "logits/rejected": -1.9242043495178223, "logps/chosen": -84.05718994140625, "logps/rejected": -65.73128509521484, "loss": 1.3602, "rewards/accuracies": 0.0, "rewards/chosen": 5.6993560791015625, "rewards/margins": -1.7343454360961914, "rewards/rejected": 7.433701515197754, "step": 3806 }, { "epoch": 0.84, "learning_rate": 6.4948846997431545e-06, "logits/chosen": -1.5554643869400024, "logits/rejected": -1.5988832712173462, "logps/chosen": -150.7115478515625, "logps/rejected": -192.16702270507812, "loss": 3.2143, "rewards/accuracies": 0.0, "rewards/chosen": 5.749289035797119, "rewards/margins": -2.884631633758545, "rewards/rejected": 8.633920669555664, "step": 3807 }, { "epoch": 0.84, "learning_rate": 6.4931742561416524e-06, "logits/chosen": -1.5750718116760254, "logits/rejected": -1.5283652544021606, "logps/chosen": -68.90834045410156, "logps/rejected": -55.845069885253906, "loss": 0.4086, "rewards/accuracies": 1.0, "rewards/chosen": 2.931631565093994, "rewards/margins": 0.813758134841919, "rewards/rejected": 2.117873430252075, "step": 3808 }, { "epoch": 0.84, "learning_rate": 6.491463620670846e-06, "logits/chosen": -1.8095122575759888, "logits/rejected": -1.685632586479187, "logps/chosen": -137.6279296875, "logps/rejected": -77.65106201171875, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 7.8538665771484375, "rewards/margins": 3.5012388229370117, "rewards/rejected": 4.352627754211426, "step": 3809 }, { "epoch": 0.84, "learning_rate": 6.489752793550551e-06, "logits/chosen": -1.8913469314575195, "logits/rejected": -1.8997477293014526, "logps/chosen": -39.23527526855469, "logps/rejected": -76.45248413085938, "loss": 0.283, "rewards/accuracies": 1.0, "rewards/chosen": 3.0534286499023438, "rewards/margins": 0.33384251594543457, "rewards/rejected": 2.719586133956909, "step": 3810 }, { "epoch": 0.84, "learning_rate": 6.488041775000604e-06, "logits/chosen": -1.695401668548584, "logits/rejected": -1.7031022310256958, "logps/chosen": -48.15532684326172, "logps/rejected": -55.67967987060547, "loss": 1.4637, "rewards/accuracies": 0.0, "rewards/chosen": 2.447772979736328, "rewards/margins": -2.6117568016052246, "rewards/rejected": 5.059529781341553, "step": 3811 }, { "epoch": 0.84, "learning_rate": 6.486330565240865e-06, "logits/chosen": -1.7442952394485474, "logits/rejected": -1.7073575258255005, "logps/chosen": -48.846588134765625, "logps/rejected": -58.96153259277344, "loss": 0.3384, "rewards/accuracies": 1.0, "rewards/chosen": 3.380633592605591, "rewards/margins": 0.13567113876342773, "rewards/rejected": 3.244962453842163, "step": 3812 }, { "epoch": 0.84, "learning_rate": 6.484619164491222e-06, "logits/chosen": -1.6196184158325195, "logits/rejected": -1.6366567611694336, "logps/chosen": -102.32373046875, "logps/rejected": -84.89470672607422, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 8.029359817504883, "rewards/margins": 4.584517478942871, "rewards/rejected": 3.444842576980591, "step": 3813 }, { "epoch": 0.84, "learning_rate": 6.482907572971584e-06, "logits/chosen": -1.6669682264328003, "logits/rejected": -1.5534076690673828, "logps/chosen": -65.1898193359375, "logps/rejected": -66.8733901977539, "loss": 0.175, "rewards/accuracies": 1.0, "rewards/chosen": 3.4268112182617188, "rewards/margins": 4.403966903686523, "rewards/rejected": -0.9771556854248047, "step": 3814 }, { "epoch": 0.84, "learning_rate": 6.4811957909018886e-06, "logits/chosen": -1.4443436861038208, "logits/rejected": -1.4198085069656372, "logps/chosen": -65.66670989990234, "logps/rejected": -39.80318832397461, "loss": 0.7116, "rewards/accuracies": 0.0, "rewards/chosen": 2.660372257232666, "rewards/margins": -1.1461946964263916, "rewards/rejected": 3.8065669536590576, "step": 3815 }, { "epoch": 0.84, "learning_rate": 6.479483818502095e-06, "logits/chosen": -1.815124273300171, "logits/rejected": -1.6525216102600098, "logps/chosen": -67.81321716308594, "logps/rejected": -18.272594451904297, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 3.1597015857696533, "rewards/margins": 1.3367366790771484, "rewards/rejected": 1.8229649066925049, "step": 3816 }, { "epoch": 0.84, "learning_rate": 6.477771655992187e-06, "logits/chosen": -1.7542181015014648, "logits/rejected": -1.7703652381896973, "logps/chosen": -114.25630187988281, "logps/rejected": -52.905982971191406, "loss": 1.3547, "rewards/accuracies": 1.0, "rewards/chosen": 6.181556701660156, "rewards/margins": 0.24265336990356445, "rewards/rejected": 5.938903331756592, "step": 3817 }, { "epoch": 0.85, "learning_rate": 6.476059303592173e-06, "logits/chosen": -1.8611711263656616, "logits/rejected": -1.9168380498886108, "logps/chosen": -66.29045104980469, "logps/rejected": -96.27922058105469, "loss": 1.8015, "rewards/accuracies": 0.0, "rewards/chosen": 3.92936110496521, "rewards/margins": -3.5671494007110596, "rewards/rejected": 7.4965105056762695, "step": 3818 }, { "epoch": 0.85, "learning_rate": 6.474346761522088e-06, "logits/chosen": -1.7395927906036377, "logits/rejected": -1.6424092054367065, "logps/chosen": -75.84290313720703, "logps/rejected": -57.3520622253418, "loss": 1.5316, "rewards/accuracies": 1.0, "rewards/chosen": 4.682331085205078, "rewards/margins": 2.0714151859283447, "rewards/rejected": 2.6109158992767334, "step": 3819 }, { "epoch": 0.85, "learning_rate": 6.472634030001988e-06, "logits/chosen": -1.9667589664459229, "logits/rejected": -1.9793546199798584, "logps/chosen": -38.481727600097656, "logps/rejected": -51.68698501586914, "loss": 1.0006, "rewards/accuracies": 0.0, "rewards/chosen": 2.76741099357605, "rewards/margins": -1.1882848739624023, "rewards/rejected": 3.955695867538452, "step": 3820 }, { "epoch": 0.85, "learning_rate": 6.4709211092519554e-06, "logits/chosen": -1.708946704864502, "logits/rejected": -1.2985624074935913, "logps/chosen": -101.69068908691406, "logps/rejected": -27.523527145385742, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 6.547255039215088, "rewards/margins": 3.719794988632202, "rewards/rejected": 2.8274600505828857, "step": 3821 }, { "epoch": 0.85, "learning_rate": 6.469207999492095e-06, "logits/chosen": -1.3111166954040527, "logits/rejected": -1.1390256881713867, "logps/chosen": -53.64869689941406, "logps/rejected": -19.35840606689453, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": 2.700875997543335, "rewards/margins": 2.695708990097046, "rewards/rejected": 0.0051670074462890625, "step": 3822 }, { "epoch": 0.85, "learning_rate": 6.4674947009425395e-06, "logits/chosen": -1.6289869546890259, "logits/rejected": -1.6470866203308105, "logps/chosen": -66.80158996582031, "logps/rejected": -92.23802185058594, "loss": 1.5186, "rewards/accuracies": 0.0, "rewards/chosen": 3.964956045150757, "rewards/margins": -2.935753583908081, "rewards/rejected": 6.900709629058838, "step": 3823 }, { "epoch": 0.85, "learning_rate": 6.4657812138234434e-06, "logits/chosen": -1.975502371788025, "logits/rejected": -1.9216928482055664, "logps/chosen": -58.01179504394531, "logps/rejected": -19.65346336364746, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": 2.664494276046753, "rewards/margins": 2.3557283878326416, "rewards/rejected": 0.30876579880714417, "step": 3824 }, { "epoch": 0.85, "learning_rate": 6.464067538354984e-06, "logits/chosen": -1.8252100944519043, "logits/rejected": -1.8166414499282837, "logps/chosen": -47.85261535644531, "logps/rejected": -68.28898620605469, "loss": 0.5254, "rewards/accuracies": 0.0, "rewards/chosen": 3.5981552600860596, "rewards/margins": -0.29193949699401855, "rewards/rejected": 3.890094757080078, "step": 3825 }, { "epoch": 0.85, "learning_rate": 6.462353674757366e-06, "logits/chosen": -1.7670432329177856, "logits/rejected": -1.940312147140503, "logps/chosen": -58.70675277709961, "logps/rejected": -37.48968505859375, "loss": 0.606, "rewards/accuracies": 1.0, "rewards/chosen": 2.1889195442199707, "rewards/margins": 0.5924935340881348, "rewards/rejected": 1.596426010131836, "step": 3826 }, { "epoch": 0.85, "learning_rate": 6.460639623250815e-06, "logits/chosen": -1.6279376745224, "logits/rejected": -1.6355793476104736, "logps/chosen": -98.90400695800781, "logps/rejected": -108.20736694335938, "loss": 0.6749, "rewards/accuracies": 0.0, "rewards/chosen": 5.486912727355957, "rewards/margins": -0.9713821411132812, "rewards/rejected": 6.458294868469238, "step": 3827 }, { "epoch": 0.85, "learning_rate": 6.4589253840555856e-06, "logits/chosen": -1.9632782936096191, "logits/rejected": -1.980049967765808, "logps/chosen": -32.983863830566406, "logps/rejected": -67.13294982910156, "loss": 0.7817, "rewards/accuracies": 0.0, "rewards/chosen": 2.175722599029541, "rewards/margins": -1.2790138721466064, "rewards/rejected": 3.4547364711761475, "step": 3828 }, { "epoch": 0.85, "learning_rate": 6.4572109573919505e-06, "logits/chosen": -1.6416302919387817, "logits/rejected": -1.552289366722107, "logps/chosen": -34.35430145263672, "logps/rejected": -26.125253677368164, "loss": 1.5343, "rewards/accuracies": 1.0, "rewards/chosen": 2.513076066970825, "rewards/margins": 0.9848947525024414, "rewards/rejected": 1.5281813144683838, "step": 3829 }, { "epoch": 0.85, "learning_rate": 6.455496343480211e-06, "logits/chosen": -1.6940122842788696, "logits/rejected": -1.6062248945236206, "logps/chosen": -80.58131408691406, "logps/rejected": -76.95759582519531, "loss": 1.4887, "rewards/accuracies": 1.0, "rewards/chosen": 4.773521423339844, "rewards/margins": 2.984196424484253, "rewards/rejected": 1.7893249988555908, "step": 3830 }, { "epoch": 0.85, "learning_rate": 6.453781542540689e-06, "logits/chosen": -1.781510591506958, "logits/rejected": -1.7540910243988037, "logps/chosen": -37.377296447753906, "logps/rejected": -43.0822868347168, "loss": 1.6098, "rewards/accuracies": 1.0, "rewards/chosen": 3.164283037185669, "rewards/margins": 0.12760663032531738, "rewards/rejected": 3.0366764068603516, "step": 3831 }, { "epoch": 0.85, "learning_rate": 6.452066554793734e-06, "logits/chosen": -1.709944248199463, "logits/rejected": -1.57398521900177, "logps/chosen": -139.70254516601562, "logps/rejected": -33.530113220214844, "loss": 1.4664, "rewards/accuracies": 1.0, "rewards/chosen": 6.676011562347412, "rewards/margins": 5.340780258178711, "rewards/rejected": 1.335231065750122, "step": 3832 }, { "epoch": 0.85, "learning_rate": 6.450351380459717e-06, "logits/chosen": -1.9193063974380493, "logits/rejected": -1.8891677856445312, "logps/chosen": -39.35642623901367, "logps/rejected": -67.23284149169922, "loss": 0.9157, "rewards/accuracies": 0.0, "rewards/chosen": 2.744530200958252, "rewards/margins": -1.1571552753448486, "rewards/rejected": 3.9016854763031006, "step": 3833 }, { "epoch": 0.85, "learning_rate": 6.448636019759036e-06, "logits/chosen": -1.6638246774673462, "logits/rejected": -1.6559898853302002, "logps/chosen": -56.16290283203125, "logps/rejected": -68.23307800292969, "loss": 0.4183, "rewards/accuracies": 0.0, "rewards/chosen": 2.5790512561798096, "rewards/margins": -0.26770472526550293, "rewards/rejected": 2.8467559814453125, "step": 3834 }, { "epoch": 0.85, "learning_rate": 6.4469204729121086e-06, "logits/chosen": -1.9148197174072266, "logits/rejected": -1.5861046314239502, "logps/chosen": -52.86508560180664, "logps/rejected": -31.018142700195312, "loss": 0.1754, "rewards/accuracies": 1.0, "rewards/chosen": 4.008695602416992, "rewards/margins": 0.8876864910125732, "rewards/rejected": 3.121009111404419, "step": 3835 }, { "epoch": 0.85, "learning_rate": 6.445204740139377e-06, "logits/chosen": -1.5598331689834595, "logits/rejected": -1.508787751197815, "logps/chosen": -92.6753921508789, "logps/rejected": -60.6572265625, "loss": 0.1706, "rewards/accuracies": 1.0, "rewards/chosen": 6.263829708099365, "rewards/margins": 3.3682210445404053, "rewards/rejected": 2.89560866355896, "step": 3836 }, { "epoch": 0.85, "learning_rate": 6.443488821661312e-06, "logits/chosen": -1.8631927967071533, "logits/rejected": -1.9016392230987549, "logps/chosen": -128.4169464111328, "logps/rejected": -102.04005432128906, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": 8.669090270996094, "rewards/margins": 0.22834110260009766, "rewards/rejected": 8.440749168395996, "step": 3837 }, { "epoch": 0.85, "learning_rate": 6.441772717698402e-06, "logits/chosen": -1.8285109996795654, "logits/rejected": -1.7019119262695312, "logps/chosen": -151.12884521484375, "logps/rejected": -44.98008728027344, "loss": 0.608, "rewards/accuracies": 1.0, "rewards/chosen": 6.327136516571045, "rewards/margins": 1.157911777496338, "rewards/rejected": 5.169224739074707, "step": 3838 }, { "epoch": 0.85, "learning_rate": 6.440056428471164e-06, "logits/chosen": -1.5388044118881226, "logits/rejected": -1.5507339239120483, "logps/chosen": -87.507080078125, "logps/rejected": -72.06512451171875, "loss": 1.2929, "rewards/accuracies": 0.0, "rewards/chosen": 2.564300537109375, "rewards/margins": -1.2504775524139404, "rewards/rejected": 3.8147780895233154, "step": 3839 }, { "epoch": 0.85, "learning_rate": 6.4383399542001375e-06, "logits/chosen": -1.9002572298049927, "logits/rejected": -0.5528983473777771, "logps/chosen": -63.37070083618164, "logps/rejected": -66.26351928710938, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 4.768788814544678, "rewards/margins": 3.8330678939819336, "rewards/rejected": 0.9357208609580994, "step": 3840 }, { "epoch": 0.85, "learning_rate": 6.436623295105884e-06, "logits/chosen": -1.8225414752960205, "logits/rejected": -1.8198221921920776, "logps/chosen": -70.03753662109375, "logps/rejected": -60.344322204589844, "loss": 2.0113, "rewards/accuracies": 0.0, "rewards/chosen": 6.217355251312256, "rewards/margins": -1.870622158050537, "rewards/rejected": 8.087977409362793, "step": 3841 }, { "epoch": 0.85, "learning_rate": 6.434906451408991e-06, "logits/chosen": -1.669232726097107, "logits/rejected": -1.2442768812179565, "logps/chosen": -92.44944763183594, "logps/rejected": -38.82444763183594, "loss": 0.4605, "rewards/accuracies": 0.0, "rewards/chosen": 2.4740586280822754, "rewards/margins": -0.006983518600463867, "rewards/rejected": 2.4810421466827393, "step": 3842 }, { "epoch": 0.85, "learning_rate": 6.433189423330068e-06, "logits/chosen": -1.7707765102386475, "logits/rejected": -1.7422916889190674, "logps/chosen": -57.92655944824219, "logps/rejected": -57.51413345336914, "loss": 0.8531, "rewards/accuracies": 0.0, "rewards/chosen": 1.9929932355880737, "rewards/margins": -0.861026406288147, "rewards/rejected": 2.8540196418762207, "step": 3843 }, { "epoch": 0.85, "learning_rate": 6.431472211089751e-06, "logits/chosen": -1.9294484853744507, "logits/rejected": -1.9541702270507812, "logps/chosen": -54.438655853271484, "logps/rejected": -59.8889045715332, "loss": 0.6541, "rewards/accuracies": 0.0, "rewards/chosen": 2.849137544631958, "rewards/margins": -0.9742386341094971, "rewards/rejected": 3.823376178741455, "step": 3844 }, { "epoch": 0.85, "learning_rate": 6.429754814908695e-06, "logits/chosen": -1.9136583805084229, "logits/rejected": -1.9119067192077637, "logps/chosen": -82.42086791992188, "logps/rejected": -66.80525970458984, "loss": 0.3758, "rewards/accuracies": 0.0, "rewards/chosen": 4.378730297088623, "rewards/margins": -0.01511383056640625, "rewards/rejected": 4.393844127655029, "step": 3845 }, { "epoch": 0.85, "learning_rate": 6.428037235007582e-06, "logits/chosen": -1.643275499343872, "logits/rejected": -1.5728837251663208, "logps/chosen": -105.46011352539062, "logps/rejected": -79.69096374511719, "loss": 0.1676, "rewards/accuracies": 1.0, "rewards/chosen": 6.501309394836426, "rewards/margins": 0.9556689262390137, "rewards/rejected": 5.545640468597412, "step": 3846 }, { "epoch": 0.85, "learning_rate": 6.4263194716071174e-06, "logits/chosen": -1.9150390625, "logits/rejected": -1.508034586906433, "logps/chosen": -66.62523651123047, "logps/rejected": -61.31817626953125, "loss": 1.0256, "rewards/accuracies": 1.0, "rewards/chosen": 2.475043535232544, "rewards/margins": 0.09890222549438477, "rewards/rejected": 2.376141309738159, "step": 3847 }, { "epoch": 0.85, "learning_rate": 6.424601524928029e-06, "logits/chosen": -1.5578563213348389, "logits/rejected": -1.5638777017593384, "logps/chosen": -11.267267227172852, "logps/rejected": -10.679448127746582, "loss": 0.4175, "rewards/accuracies": 0.0, "rewards/chosen": 1.3713502883911133, "rewards/margins": -0.17132043838500977, "rewards/rejected": 1.542670726776123, "step": 3848 }, { "epoch": 0.85, "learning_rate": 6.422883395191069e-06, "logits/chosen": -1.7959610223770142, "logits/rejected": -1.7338989973068237, "logps/chosen": -58.71852111816406, "logps/rejected": -63.72618103027344, "loss": 0.6474, "rewards/accuracies": 0.0, "rewards/chosen": 2.284802198410034, "rewards/margins": -0.7155869007110596, "rewards/rejected": 3.0003890991210938, "step": 3849 }, { "epoch": 0.85, "learning_rate": 6.421165082617013e-06, "logits/chosen": -1.4688270092010498, "logits/rejected": -1.3959933519363403, "logps/chosen": -111.52478790283203, "logps/rejected": -114.72351837158203, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 8.524270057678223, "rewards/margins": 1.595689296722412, "rewards/rejected": 6.9285807609558105, "step": 3850 }, { "epoch": 0.85, "learning_rate": 6.419446587426662e-06, "logits/chosen": -1.401703953742981, "logits/rejected": -1.393128514289856, "logps/chosen": -43.717952728271484, "logps/rejected": -68.34339904785156, "loss": 1.2534, "rewards/accuracies": 0.0, "rewards/chosen": 1.5716114044189453, "rewards/margins": -2.4202816486358643, "rewards/rejected": 3.9918930530548096, "step": 3851 }, { "epoch": 0.85, "learning_rate": 6.417727909840836e-06, "logits/chosen": -1.242004156112671, "logits/rejected": -1.2746145725250244, "logps/chosen": -47.92280578613281, "logps/rejected": -66.45333862304688, "loss": 0.711, "rewards/accuracies": 0.0, "rewards/chosen": 3.940812826156616, "rewards/margins": -0.52024245262146, "rewards/rejected": 4.461055278778076, "step": 3852 }, { "epoch": 0.85, "learning_rate": 6.416009050080383e-06, "logits/chosen": -1.9169135093688965, "logits/rejected": -1.9181499481201172, "logps/chosen": -52.62053680419922, "logps/rejected": -20.60401725769043, "loss": 0.8117, "rewards/accuracies": 1.0, "rewards/chosen": 1.7032517194747925, "rewards/margins": 1.0739240646362305, "rewards/rejected": 0.6293275952339172, "step": 3853 }, { "epoch": 0.85, "learning_rate": 6.414290008366169e-06, "logits/chosen": -1.8472529649734497, "logits/rejected": -1.6774593591690063, "logps/chosen": -84.02949523925781, "logps/rejected": -10.055773735046387, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 4.569947719573975, "rewards/margins": 3.5219030380249023, "rewards/rejected": 1.0480446815490723, "step": 3854 }, { "epoch": 0.85, "learning_rate": 6.41257078491909e-06, "logits/chosen": -1.5683525800704956, "logits/rejected": -1.5494180917739868, "logps/chosen": -47.48512268066406, "logps/rejected": -45.21461486816406, "loss": 0.4878, "rewards/accuracies": 0.0, "rewards/chosen": 3.115408420562744, "rewards/margins": -0.06709885597229004, "rewards/rejected": 3.182507276535034, "step": 3855 }, { "epoch": 0.85, "learning_rate": 6.41085137996006e-06, "logits/chosen": -1.5658159255981445, "logits/rejected": -1.5634089708328247, "logps/chosen": -129.4298553466797, "logps/rejected": -107.48365020751953, "loss": 0.6309, "rewards/accuracies": 0.0, "rewards/chosen": 5.079982280731201, "rewards/margins": -0.852567195892334, "rewards/rejected": 5.932549476623535, "step": 3856 }, { "epoch": 0.85, "learning_rate": 6.409131793710019e-06, "logits/chosen": -1.8710979223251343, "logits/rejected": -1.9836535453796387, "logps/chosen": -68.13102722167969, "logps/rejected": -140.32395935058594, "loss": 3.2908, "rewards/accuracies": 0.0, "rewards/chosen": 3.5725343227386475, "rewards/margins": -6.5620222091674805, "rewards/rejected": 10.134556770324707, "step": 3857 }, { "epoch": 0.85, "learning_rate": 6.407412026389931e-06, "logits/chosen": -1.5034550428390503, "logits/rejected": -1.356268286705017, "logps/chosen": -60.03710174560547, "logps/rejected": -22.110919952392578, "loss": 1.1589, "rewards/accuracies": 0.0, "rewards/chosen": 2.8024253845214844, "rewards/margins": -0.54306960105896, "rewards/rejected": 3.3454949855804443, "step": 3858 }, { "epoch": 0.85, "learning_rate": 6.40569207822078e-06, "logits/chosen": -1.553836464881897, "logits/rejected": -1.6012542247772217, "logps/chosen": -45.202613830566406, "logps/rejected": -99.12625122070312, "loss": 1.9258, "rewards/accuracies": 0.0, "rewards/chosen": 3.0747992992401123, "rewards/margins": -3.1151115894317627, "rewards/rejected": 6.189910888671875, "step": 3859 }, { "epoch": 0.85, "learning_rate": 6.403971949423577e-06, "logits/chosen": -1.9159972667694092, "logits/rejected": -1.8670156002044678, "logps/chosen": -72.37211608886719, "logps/rejected": -47.6531982421875, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 7.036018371582031, "rewards/margins": 3.596235513687134, "rewards/rejected": 3.4397828578948975, "step": 3860 }, { "epoch": 0.85, "learning_rate": 6.402251640219353e-06, "logits/chosen": -1.8157926797866821, "logits/rejected": -1.8467392921447754, "logps/chosen": -62.04258728027344, "logps/rejected": -80.41357421875, "loss": 0.3573, "rewards/accuracies": 1.0, "rewards/chosen": 3.408742666244507, "rewards/margins": 1.3210976123809814, "rewards/rejected": 2.0876450538635254, "step": 3861 }, { "epoch": 0.85, "learning_rate": 6.400531150829165e-06, "logits/chosen": -1.5236976146697998, "logits/rejected": -1.4662197828292847, "logps/chosen": -120.85430908203125, "logps/rejected": -51.009613037109375, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": 7.213342189788818, "rewards/margins": 3.8224425315856934, "rewards/rejected": 3.390899658203125, "step": 3862 }, { "epoch": 0.86, "learning_rate": 6.39881048147409e-06, "logits/chosen": -1.4723857641220093, "logits/rejected": -1.283433437347412, "logps/chosen": -96.02519226074219, "logps/rejected": -34.15171813964844, "loss": 2.0222, "rewards/accuracies": 1.0, "rewards/chosen": 8.603865623474121, "rewards/margins": 4.710681915283203, "rewards/rejected": 3.893183946609497, "step": 3863 }, { "epoch": 0.86, "learning_rate": 6.397089632375231e-06, "logits/chosen": -1.5927103757858276, "logits/rejected": -1.5289891958236694, "logps/chosen": -158.54916381835938, "logps/rejected": -93.94468688964844, "loss": 0.2327, "rewards/accuracies": 1.0, "rewards/chosen": 6.700109958648682, "rewards/margins": 4.6311845779418945, "rewards/rejected": 2.068925619125366, "step": 3864 }, { "epoch": 0.86, "learning_rate": 6.395368603753713e-06, "logits/chosen": -1.4243510961532593, "logits/rejected": -1.3221081495285034, "logps/chosen": -57.2314453125, "logps/rejected": -46.5005989074707, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 3.146135091781616, "rewards/margins": 0.8757023811340332, "rewards/rejected": 2.270432710647583, "step": 3865 }, { "epoch": 0.86, "learning_rate": 6.3936473958306845e-06, "logits/chosen": -1.828946590423584, "logits/rejected": -1.786501169204712, "logps/chosen": -69.39515686035156, "logps/rejected": -61.0631103515625, "loss": 0.6525, "rewards/accuracies": 0.0, "rewards/chosen": 3.2201950550079346, "rewards/margins": -0.931326150894165, "rewards/rejected": 4.1515212059021, "step": 3866 }, { "epoch": 0.86, "learning_rate": 6.391926008827314e-06, "logits/chosen": -1.528879165649414, "logits/rejected": -1.401158332824707, "logps/chosen": -95.92739868164062, "logps/rejected": -59.310279846191406, "loss": 0.1645, "rewards/accuracies": 1.0, "rewards/chosen": 6.227261543273926, "rewards/margins": 3.364675283432007, "rewards/rejected": 2.862586259841919, "step": 3867 }, { "epoch": 0.86, "learning_rate": 6.390204442964798e-06, "logits/chosen": -1.7052916288375854, "logits/rejected": -1.491136074066162, "logps/chosen": -184.47845458984375, "logps/rejected": -98.69635009765625, "loss": 1.5515, "rewards/accuracies": 0.0, "rewards/chosen": 5.177189826965332, "rewards/margins": -1.1355695724487305, "rewards/rejected": 6.3127593994140625, "step": 3868 }, { "epoch": 0.86, "learning_rate": 6.388482698464353e-06, "logits/chosen": -1.5848724842071533, "logits/rejected": -1.5383480787277222, "logps/chosen": -55.55060577392578, "logps/rejected": -36.547359466552734, "loss": 0.5377, "rewards/accuracies": 0.0, "rewards/chosen": 3.3949639797210693, "rewards/margins": -0.6517465114593506, "rewards/rejected": 4.04671049118042, "step": 3869 }, { "epoch": 0.86, "learning_rate": 6.386760775547221e-06, "logits/chosen": -1.687343716621399, "logits/rejected": -1.647499680519104, "logps/chosen": -145.54327392578125, "logps/rejected": -82.49589538574219, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 9.026168823242188, "rewards/margins": 1.655073642730713, "rewards/rejected": 7.371095180511475, "step": 3870 }, { "epoch": 0.86, "learning_rate": 6.385038674434662e-06, "logits/chosen": -1.595773458480835, "logits/rejected": -1.5722086429595947, "logps/chosen": -43.6259765625, "logps/rejected": -56.06450653076172, "loss": 0.3523, "rewards/accuracies": 1.0, "rewards/chosen": 2.603353261947632, "rewards/margins": 0.37749266624450684, "rewards/rejected": 2.225860595703125, "step": 3871 }, { "epoch": 0.86, "learning_rate": 6.383316395347962e-06, "logits/chosen": -1.8124568462371826, "logits/rejected": -1.8081023693084717, "logps/chosen": -39.00870895385742, "logps/rejected": -29.689823150634766, "loss": 0.473, "rewards/accuracies": 0.0, "rewards/chosen": 2.1845920085906982, "rewards/margins": -0.20557761192321777, "rewards/rejected": 2.390169620513916, "step": 3872 }, { "epoch": 0.86, "learning_rate": 6.3815939385084305e-06, "logits/chosen": -1.7723209857940674, "logits/rejected": -1.7761081457138062, "logps/chosen": -23.053325653076172, "logps/rejected": -60.09526443481445, "loss": 1.7817, "rewards/accuracies": 0.0, "rewards/chosen": 1.7768490314483643, "rewards/margins": -1.1927313804626465, "rewards/rejected": 2.9695804119110107, "step": 3873 }, { "epoch": 0.86, "learning_rate": 6.3798713041373995e-06, "logits/chosen": -1.960474967956543, "logits/rejected": -1.9456846714019775, "logps/chosen": -91.29470825195312, "logps/rejected": -69.12178039550781, "loss": 0.3607, "rewards/accuracies": 1.0, "rewards/chosen": 6.698060512542725, "rewards/margins": 2.244058132171631, "rewards/rejected": 4.454002380371094, "step": 3874 }, { "epoch": 0.86, "learning_rate": 6.378148492456224e-06, "logits/chosen": -1.6264243125915527, "logits/rejected": -1.4645919799804688, "logps/chosen": -52.136905670166016, "logps/rejected": -22.919872283935547, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 2.827360153198242, "rewards/margins": 1.3657348155975342, "rewards/rejected": 1.461625337600708, "step": 3875 }, { "epoch": 0.86, "learning_rate": 6.376425503686279e-06, "logits/chosen": -1.418586254119873, "logits/rejected": -1.4167473316192627, "logps/chosen": -36.66212463378906, "logps/rejected": -45.03644943237305, "loss": 1.6151, "rewards/accuracies": 0.0, "rewards/chosen": 2.921506643295288, "rewards/margins": -1.6646268367767334, "rewards/rejected": 4.5861334800720215, "step": 3876 }, { "epoch": 0.86, "learning_rate": 6.374702338048966e-06, "logits/chosen": -1.7365056276321411, "logits/rejected": -1.7498400211334229, "logps/chosen": -106.50362396240234, "logps/rejected": -80.77992248535156, "loss": 0.1921, "rewards/accuracies": 1.0, "rewards/chosen": 7.215409278869629, "rewards/margins": 1.3943476676940918, "rewards/rejected": 5.821061611175537, "step": 3877 }, { "epoch": 0.86, "learning_rate": 6.372978995765708e-06, "logits/chosen": -1.693413496017456, "logits/rejected": -1.6549139022827148, "logps/chosen": -56.69380569458008, "logps/rejected": -53.48297119140625, "loss": 0.2754, "rewards/accuracies": 1.0, "rewards/chosen": 3.0864315032958984, "rewards/margins": 0.31279706954956055, "rewards/rejected": 2.773634433746338, "step": 3878 }, { "epoch": 0.86, "learning_rate": 6.371255477057949e-06, "logits/chosen": -1.6593207120895386, "logits/rejected": -1.6677778959274292, "logps/chosen": -83.78701782226562, "logps/rejected": -65.78802490234375, "loss": 0.2984, "rewards/accuracies": 1.0, "rewards/chosen": 5.84047269821167, "rewards/margins": 0.28211307525634766, "rewards/rejected": 5.558359622955322, "step": 3879 }, { "epoch": 0.86, "learning_rate": 6.369531782147157e-06, "logits/chosen": -1.774233102798462, "logits/rejected": -1.7154701948165894, "logps/chosen": -102.3851318359375, "logps/rejected": -51.39720916748047, "loss": 0.1605, "rewards/accuracies": 1.0, "rewards/chosen": 5.982480049133301, "rewards/margins": 1.4226226806640625, "rewards/rejected": 4.559857368469238, "step": 3880 }, { "epoch": 0.86, "learning_rate": 6.367807911254824e-06, "logits/chosen": -1.6341255903244019, "logits/rejected": -1.5785764455795288, "logps/chosen": -76.73521423339844, "logps/rejected": -50.123077392578125, "loss": 0.2238, "rewards/accuracies": 1.0, "rewards/chosen": 4.368963718414307, "rewards/margins": 0.8586540222167969, "rewards/rejected": 3.5103096961975098, "step": 3881 }, { "epoch": 0.86, "learning_rate": 6.366083864602461e-06, "logits/chosen": -1.7649914026260376, "logits/rejected": -1.6991920471191406, "logps/chosen": -88.35313415527344, "logps/rejected": -49.74837112426758, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": 8.017985343933105, "rewards/margins": 4.890295505523682, "rewards/rejected": 3.127689838409424, "step": 3882 }, { "epoch": 0.86, "learning_rate": 6.3643596424116075e-06, "logits/chosen": -1.5458406209945679, "logits/rejected": -1.5450955629348755, "logps/chosen": -111.75151062011719, "logps/rejected": -74.62478637695312, "loss": 1.0367, "rewards/accuracies": 0.0, "rewards/chosen": 5.486274719238281, "rewards/margins": -1.5389132499694824, "rewards/rejected": 7.025187969207764, "step": 3883 }, { "epoch": 0.86, "learning_rate": 6.362635244903818e-06, "logits/chosen": -1.8248696327209473, "logits/rejected": -1.6668621301651, "logps/chosen": -94.77400207519531, "logps/rejected": -27.092679977416992, "loss": 0.4727, "rewards/accuracies": 1.0, "rewards/chosen": 8.893104553222656, "rewards/margins": 7.350552558898926, "rewards/rejected": 1.5425519943237305, "step": 3884 }, { "epoch": 0.86, "learning_rate": 6.360910672300677e-06, "logits/chosen": -1.8921375274658203, "logits/rejected": -1.8291683197021484, "logps/chosen": -138.7802276611328, "logps/rejected": -26.25666046142578, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 8.377677917480469, "rewards/margins": 4.394593715667725, "rewards/rejected": 3.983084201812744, "step": 3885 }, { "epoch": 0.86, "learning_rate": 6.3591859248237855e-06, "logits/chosen": -1.5137513875961304, "logits/rejected": -1.3950313329696655, "logps/chosen": -79.56001281738281, "logps/rejected": -52.957733154296875, "loss": 0.2096, "rewards/accuracies": 1.0, "rewards/chosen": 3.7029054164886475, "rewards/margins": 0.7580444812774658, "rewards/rejected": 2.9448609352111816, "step": 3886 }, { "epoch": 0.86, "learning_rate": 6.357461002694771e-06, "logits/chosen": -1.8948227167129517, "logits/rejected": -1.8375717401504517, "logps/chosen": -92.54521179199219, "logps/rejected": -37.94425582885742, "loss": 0.5467, "rewards/accuracies": 0.0, "rewards/chosen": 2.8996522426605225, "rewards/margins": -0.5882570743560791, "rewards/rejected": 3.4879093170166016, "step": 3887 }, { "epoch": 0.86, "learning_rate": 6.355735906135278e-06, "logits/chosen": -1.6301491260528564, "logits/rejected": -1.6262010335922241, "logps/chosen": -63.193172454833984, "logps/rejected": -44.0168342590332, "loss": 0.649, "rewards/accuracies": 1.0, "rewards/chosen": 3.262521743774414, "rewards/margins": 0.4476759433746338, "rewards/rejected": 2.8148458003997803, "step": 3888 }, { "epoch": 0.86, "learning_rate": 6.354010635366983e-06, "logits/chosen": -1.7457672357559204, "logits/rejected": -1.7466627359390259, "logps/chosen": -44.17143249511719, "logps/rejected": -46.274559020996094, "loss": 0.427, "rewards/accuracies": 0.0, "rewards/chosen": 1.751592993736267, "rewards/margins": -0.29765164852142334, "rewards/rejected": 2.0492446422576904, "step": 3889 }, { "epoch": 0.86, "learning_rate": 6.352285190611575e-06, "logits/chosen": -1.719005823135376, "logits/rejected": -1.674560308456421, "logps/chosen": -48.73458480834961, "logps/rejected": -49.19492721557617, "loss": 0.2672, "rewards/accuracies": 1.0, "rewards/chosen": 2.9073407649993896, "rewards/margins": 0.5128135681152344, "rewards/rejected": 2.3945271968841553, "step": 3890 }, { "epoch": 0.86, "learning_rate": 6.350559572090771e-06, "logits/chosen": -1.834556221961975, "logits/rejected": -1.8393388986587524, "logps/chosen": -82.37455749511719, "logps/rejected": -68.48401641845703, "loss": 0.6688, "rewards/accuracies": 0.0, "rewards/chosen": 2.5434250831604004, "rewards/margins": -1.0173003673553467, "rewards/rejected": 3.560725450515747, "step": 3891 }, { "epoch": 0.86, "learning_rate": 6.348833780026307e-06, "logits/chosen": -1.5909655094146729, "logits/rejected": -1.5798474550247192, "logps/chosen": -53.21409225463867, "logps/rejected": -68.02508544921875, "loss": 1.0946, "rewards/accuracies": 0.0, "rewards/chosen": 1.9262112379074097, "rewards/margins": -2.0572214126586914, "rewards/rejected": 3.9834327697753906, "step": 3892 }, { "epoch": 0.86, "learning_rate": 6.3471078146399465e-06, "logits/chosen": -1.4513494968414307, "logits/rejected": -1.4321985244750977, "logps/chosen": -99.83694458007812, "logps/rejected": -89.42678833007812, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 3.577711582183838, "rewards/margins": 0.09779047966003418, "rewards/rejected": 3.4799211025238037, "step": 3893 }, { "epoch": 0.86, "learning_rate": 6.345381676153472e-06, "logits/chosen": -1.9559895992279053, "logits/rejected": -1.9363749027252197, "logps/chosen": -79.2679672241211, "logps/rejected": -77.81350708007812, "loss": 0.6413, "rewards/accuracies": 0.0, "rewards/chosen": 3.7123947143554688, "rewards/margins": -0.9279036521911621, "rewards/rejected": 4.640298366546631, "step": 3894 }, { "epoch": 0.86, "learning_rate": 6.343655364788684e-06, "logits/chosen": -1.7837071418762207, "logits/rejected": -1.75029718875885, "logps/chosen": -96.55721282958984, "logps/rejected": -85.296875, "loss": 0.5202, "rewards/accuracies": 0.0, "rewards/chosen": 7.376126766204834, "rewards/margins": -0.14048385620117188, "rewards/rejected": 7.516610622406006, "step": 3895 }, { "epoch": 0.86, "learning_rate": 6.341928880767413e-06, "logits/chosen": -1.6317607164382935, "logits/rejected": -1.6038326025009155, "logps/chosen": -87.24723052978516, "logps/rejected": -56.810890197753906, "loss": 1.1184, "rewards/accuracies": 0.0, "rewards/chosen": 3.940333604812622, "rewards/margins": -1.3260581493377686, "rewards/rejected": 5.266391754150391, "step": 3896 }, { "epoch": 0.86, "learning_rate": 6.340202224311504e-06, "logits/chosen": -1.626207709312439, "logits/rejected": -1.58282470703125, "logps/chosen": -64.13648986816406, "logps/rejected": -51.11140823364258, "loss": 1.1841, "rewards/accuracies": 0.0, "rewards/chosen": 2.992880344390869, "rewards/margins": -1.8014163970947266, "rewards/rejected": 4.794296741485596, "step": 3897 }, { "epoch": 0.86, "learning_rate": 6.338475395642834e-06, "logits/chosen": -1.8095886707305908, "logits/rejected": -1.7449458837509155, "logps/chosen": -73.53486633300781, "logps/rejected": -34.92604064941406, "loss": 0.8684, "rewards/accuracies": 1.0, "rewards/chosen": 5.310451030731201, "rewards/margins": 1.5744884014129639, "rewards/rejected": 3.7359626293182373, "step": 3898 }, { "epoch": 0.86, "learning_rate": 6.3367483949832916e-06, "logits/chosen": -1.6794812679290771, "logits/rejected": -1.7025073766708374, "logps/chosen": -103.11467742919922, "logps/rejected": -46.21464538574219, "loss": 0.4723, "rewards/accuracies": 1.0, "rewards/chosen": 5.062661170959473, "rewards/margins": 0.14115667343139648, "rewards/rejected": 4.921504497528076, "step": 3899 }, { "epoch": 0.86, "learning_rate": 6.3350212225547935e-06, "logits/chosen": -1.5630614757537842, "logits/rejected": -1.5437185764312744, "logps/chosen": -51.094078063964844, "logps/rejected": -38.087852478027344, "loss": 0.4017, "rewards/accuracies": 0.0, "rewards/chosen": 1.8444229364395142, "rewards/margins": -0.14650380611419678, "rewards/rejected": 1.990926742553711, "step": 3900 }, { "epoch": 0.86, "learning_rate": 6.333293878579278e-06, "logits/chosen": -1.6397393941879272, "logits/rejected": -1.5680477619171143, "logps/chosen": -62.73045349121094, "logps/rejected": -70.47763061523438, "loss": 1.1716, "rewards/accuracies": 0.0, "rewards/chosen": 2.1941444873809814, "rewards/margins": -2.2158567905426025, "rewards/rejected": 4.410001277923584, "step": 3901 }, { "epoch": 0.86, "learning_rate": 6.331566363278705e-06, "logits/chosen": -1.5196805000305176, "logits/rejected": -1.5337677001953125, "logps/chosen": -41.880496978759766, "logps/rejected": -46.30780029296875, "loss": 1.6879, "rewards/accuracies": 1.0, "rewards/chosen": 3.0724148750305176, "rewards/margins": 0.22841525077819824, "rewards/rejected": 2.8439996242523193, "step": 3902 }, { "epoch": 0.86, "learning_rate": 6.329838676875056e-06, "logits/chosen": -1.7093688249588013, "logits/rejected": -1.711918592453003, "logps/chosen": -74.97254943847656, "logps/rejected": -29.044795989990234, "loss": 1.8387, "rewards/accuracies": 0.0, "rewards/chosen": 2.510284423828125, "rewards/margins": -3.231485366821289, "rewards/rejected": 5.741769790649414, "step": 3903 }, { "epoch": 0.86, "learning_rate": 6.328110819590333e-06, "logits/chosen": -1.6989933252334595, "logits/rejected": -1.6787556409835815, "logps/chosen": -91.22003173828125, "logps/rejected": -45.60282897949219, "loss": 0.5971, "rewards/accuracies": 0.0, "rewards/chosen": 4.871936321258545, "rewards/margins": -0.8299465179443359, "rewards/rejected": 5.701882839202881, "step": 3904 }, { "epoch": 0.86, "learning_rate": 6.326382791646562e-06, "logits/chosen": -1.5324255228042603, "logits/rejected": -1.5072176456451416, "logps/chosen": -70.55070495605469, "logps/rejected": -38.95799255371094, "loss": 0.6155, "rewards/accuracies": 0.0, "rewards/chosen": 2.381723165512085, "rewards/margins": -0.5467803478240967, "rewards/rejected": 2.9285035133361816, "step": 3905 }, { "epoch": 0.86, "learning_rate": 6.324654593265792e-06, "logits/chosen": -1.6850404739379883, "logits/rejected": -1.676213026046753, "logps/chosen": -39.18083953857422, "logps/rejected": -76.61676788330078, "loss": 0.2756, "rewards/accuracies": 1.0, "rewards/chosen": 3.6304092407226562, "rewards/margins": 0.7680213451385498, "rewards/rejected": 2.8623878955841064, "step": 3906 }, { "epoch": 0.86, "learning_rate": 6.32292622467009e-06, "logits/chosen": -1.5796738862991333, "logits/rejected": -1.515071153640747, "logps/chosen": -89.79649353027344, "logps/rejected": -71.77226257324219, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 6.17388916015625, "rewards/margins": 3.835894823074341, "rewards/rejected": 2.337994337081909, "step": 3907 }, { "epoch": 0.86, "learning_rate": 6.321197686081548e-06, "logits/chosen": -1.5505828857421875, "logits/rejected": -1.5815973281860352, "logps/chosen": -63.3780632019043, "logps/rejected": -60.40805435180664, "loss": 4.2501, "rewards/accuracies": 0.0, "rewards/chosen": 2.6459546089172363, "rewards/margins": -4.997507095336914, "rewards/rejected": 7.64346170425415, "step": 3908 }, { "epoch": 0.87, "learning_rate": 6.319468977722279e-06, "logits/chosen": -1.4729788303375244, "logits/rejected": -1.1201984882354736, "logps/chosen": -46.845096588134766, "logps/rejected": -77.71673583984375, "loss": 2.0877, "rewards/accuracies": 0.0, "rewards/chosen": 3.5696842670440674, "rewards/margins": -2.2626430988311768, "rewards/rejected": 5.832327365875244, "step": 3909 }, { "epoch": 0.87, "learning_rate": 6.31774009981442e-06, "logits/chosen": -1.6322765350341797, "logits/rejected": -1.4814094305038452, "logps/chosen": -66.7916488647461, "logps/rejected": -25.7807674407959, "loss": 0.2382, "rewards/accuracies": 1.0, "rewards/chosen": 3.0555131435394287, "rewards/margins": 2.889007568359375, "rewards/rejected": 0.1665056198835373, "step": 3910 }, { "epoch": 0.87, "learning_rate": 6.316011052580127e-06, "logits/chosen": -1.5958495140075684, "logits/rejected": -1.4743722677230835, "logps/chosen": -121.44467163085938, "logps/rejected": -43.877845764160156, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 5.680209636688232, "rewards/margins": 1.098043441772461, "rewards/rejected": 4.5821661949157715, "step": 3911 }, { "epoch": 0.87, "learning_rate": 6.314281836241573e-06, "logits/chosen": -1.3932591676712036, "logits/rejected": -1.3918836116790771, "logps/chosen": -130.46060180664062, "logps/rejected": -68.41262817382812, "loss": 1.5998, "rewards/accuracies": 0.0, "rewards/chosen": 4.955163478851318, "rewards/margins": -1.5314407348632812, "rewards/rejected": 6.4866042137146, "step": 3912 }, { "epoch": 0.87, "learning_rate": 6.312552451020966e-06, "logits/chosen": -1.6218003034591675, "logits/rejected": -1.5490343570709229, "logps/chosen": -59.47888946533203, "logps/rejected": -12.144548416137695, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 5.742197513580322, "rewards/margins": 4.098283767700195, "rewards/rejected": 1.6439136266708374, "step": 3913 }, { "epoch": 0.87, "learning_rate": 6.310822897140523e-06, "logits/chosen": -1.9461809396743774, "logits/rejected": -1.9461809396743774, "logps/chosen": -41.64105224609375, "logps/rejected": -41.64105224609375, "loss": 0.3751, "rewards/accuracies": 0.0, "rewards/chosen": 4.746351718902588, "rewards/margins": 0.0, "rewards/rejected": 4.746351718902588, "step": 3914 }, { "epoch": 0.87, "learning_rate": 6.30909317482249e-06, "logits/chosen": -1.4161744117736816, "logits/rejected": -1.3584709167480469, "logps/chosen": -51.20198059082031, "logps/rejected": -66.27473449707031, "loss": 0.6367, "rewards/accuracies": 0.0, "rewards/chosen": 3.7815208435058594, "rewards/margins": -0.17850565910339355, "rewards/rejected": 3.960026502609253, "step": 3915 }, { "epoch": 0.87, "learning_rate": 6.30736328428913e-06, "logits/chosen": -1.6401821374893188, "logits/rejected": -1.671606421470642, "logps/chosen": -70.83499908447266, "logps/rejected": -87.44570922851562, "loss": 1.7864, "rewards/accuracies": 0.0, "rewards/chosen": 2.429286241531372, "rewards/margins": -3.104545831680298, "rewards/rejected": 5.53383207321167, "step": 3916 }, { "epoch": 0.87, "learning_rate": 6.30563322576273e-06, "logits/chosen": -1.4377485513687134, "logits/rejected": -1.3986904621124268, "logps/chosen": -66.81326293945312, "logps/rejected": -73.40321350097656, "loss": 0.2439, "rewards/accuracies": 1.0, "rewards/chosen": 3.2002480030059814, "rewards/margins": 0.531684160232544, "rewards/rejected": 2.6685638427734375, "step": 3917 }, { "epoch": 0.87, "learning_rate": 6.303902999465601e-06, "logits/chosen": -1.7092597484588623, "logits/rejected": -1.7092597484588623, "logps/chosen": -64.60247039794922, "logps/rejected": -64.60247039794922, "loss": 0.5464, "rewards/accuracies": 0.0, "rewards/chosen": 4.181877136230469, "rewards/margins": 0.0, "rewards/rejected": 4.181877136230469, "step": 3918 }, { "epoch": 0.87, "learning_rate": 6.30217260562007e-06, "logits/chosen": -1.6594345569610596, "logits/rejected": -1.6417181491851807, "logps/chosen": -113.57878875732422, "logps/rejected": -241.67918395996094, "loss": 2.4309, "rewards/accuracies": 0.0, "rewards/chosen": 7.465683937072754, "rewards/margins": -2.9836339950561523, "rewards/rejected": 10.449317932128906, "step": 3919 }, { "epoch": 0.87, "learning_rate": 6.300442044448491e-06, "logits/chosen": -1.8339614868164062, "logits/rejected": -1.8209288120269775, "logps/chosen": -122.94415283203125, "logps/rejected": -97.6067886352539, "loss": 0.1762, "rewards/accuracies": 1.0, "rewards/chosen": 8.233894348144531, "rewards/margins": 0.8811850547790527, "rewards/rejected": 7.3527092933654785, "step": 3920 }, { "epoch": 0.87, "learning_rate": 6.298711316173234e-06, "logits/chosen": -1.87883722782135, "logits/rejected": -1.7591041326522827, "logps/chosen": -44.53874969482422, "logps/rejected": -39.04038619995117, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": 5.1147685050964355, "rewards/margins": 3.120392322540283, "rewards/rejected": 1.9943760633468628, "step": 3921 }, { "epoch": 0.87, "learning_rate": 6.296980421016694e-06, "logits/chosen": -1.7913779020309448, "logits/rejected": -1.7519267797470093, "logps/chosen": -133.39035034179688, "logps/rejected": -98.43927001953125, "loss": 0.9591, "rewards/accuracies": 0.0, "rewards/chosen": 6.177409648895264, "rewards/margins": -1.446253776550293, "rewards/rejected": 7.623663425445557, "step": 3922 }, { "epoch": 0.87, "learning_rate": 6.295249359201288e-06, "logits/chosen": -1.8970208168029785, "logits/rejected": -1.868811845779419, "logps/chosen": -110.22145080566406, "logps/rejected": -64.65766143798828, "loss": 0.919, "rewards/accuracies": 1.0, "rewards/chosen": 7.103068828582764, "rewards/margins": 3.6573970317840576, "rewards/rejected": 3.445671796798706, "step": 3923 }, { "epoch": 0.87, "learning_rate": 6.293518130949454e-06, "logits/chosen": -1.5298963785171509, "logits/rejected": -1.4655687808990479, "logps/chosen": -99.47573852539062, "logps/rejected": -79.54678344726562, "loss": 0.1383, "rewards/accuracies": 1.0, "rewards/chosen": 7.381219387054443, "rewards/margins": 2.340646266937256, "rewards/rejected": 5.0405731201171875, "step": 3924 }, { "epoch": 0.87, "learning_rate": 6.29178673648365e-06, "logits/chosen": -1.7448896169662476, "logits/rejected": -1.7149245738983154, "logps/chosen": -63.203094482421875, "logps/rejected": -52.881351470947266, "loss": 0.1503, "rewards/accuracies": 1.0, "rewards/chosen": 3.721219778060913, "rewards/margins": 1.060363531112671, "rewards/rejected": 2.660856246948242, "step": 3925 }, { "epoch": 0.87, "learning_rate": 6.2900551760263564e-06, "logits/chosen": -1.7153785228729248, "logits/rejected": -1.6941636800765991, "logps/chosen": -118.45819091796875, "logps/rejected": -146.07469177246094, "loss": 1.5002, "rewards/accuracies": 0.0, "rewards/chosen": 5.266208171844482, "rewards/margins": -2.2100095748901367, "rewards/rejected": 7.476217746734619, "step": 3926 }, { "epoch": 0.87, "learning_rate": 6.288323449800072e-06, "logits/chosen": -1.825987696647644, "logits/rejected": -1.7679094076156616, "logps/chosen": -56.06989288330078, "logps/rejected": -23.09772300720215, "loss": 0.7073, "rewards/accuracies": 1.0, "rewards/chosen": 3.0413780212402344, "rewards/margins": 2.487856388092041, "rewards/rejected": 0.5535217523574829, "step": 3927 }, { "epoch": 0.87, "learning_rate": 6.286591558027322e-06, "logits/chosen": -1.3085280656814575, "logits/rejected": -1.2681753635406494, "logps/chosen": -32.6035270690918, "logps/rejected": -21.789426803588867, "loss": 1.0391, "rewards/accuracies": 0.0, "rewards/chosen": 1.5289524793624878, "rewards/margins": -1.0198124647140503, "rewards/rejected": 2.548764944076538, "step": 3928 }, { "epoch": 0.87, "learning_rate": 6.284859500930651e-06, "logits/chosen": -1.7235373258590698, "logits/rejected": -1.7245012521743774, "logps/chosen": -52.72173309326172, "logps/rejected": -95.8853759765625, "loss": 0.6306, "rewards/accuracies": 0.0, "rewards/chosen": 3.785398244857788, "rewards/margins": -0.9173314571380615, "rewards/rejected": 4.70272970199585, "step": 3929 }, { "epoch": 0.87, "learning_rate": 6.283127278732622e-06, "logits/chosen": -1.4207097291946411, "logits/rejected": -1.4846346378326416, "logps/chosen": -27.092159271240234, "logps/rejected": -108.10687255859375, "loss": 1.3188, "rewards/accuracies": 0.0, "rewards/chosen": 2.4089410305023193, "rewards/margins": -1.239565134048462, "rewards/rejected": 3.6485061645507812, "step": 3930 }, { "epoch": 0.87, "learning_rate": 6.281394891655821e-06, "logits/chosen": -1.4698013067245483, "logits/rejected": -1.4186805486679077, "logps/chosen": -40.455482482910156, "logps/rejected": -26.854516983032227, "loss": 1.037, "rewards/accuracies": 0.0, "rewards/chosen": 1.9562866687774658, "rewards/margins": -0.6017670631408691, "rewards/rejected": 2.558053731918335, "step": 3931 }, { "epoch": 0.87, "learning_rate": 6.279662339922858e-06, "logits/chosen": -1.7633193731307983, "logits/rejected": -1.6905454397201538, "logps/chosen": -139.2384490966797, "logps/rejected": -39.766380310058594, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": 7.110557556152344, "rewards/margins": 4.594637870788574, "rewards/rejected": 2.5159194469451904, "step": 3932 }, { "epoch": 0.87, "learning_rate": 6.277929623756359e-06, "logits/chosen": -1.4687516689300537, "logits/rejected": -1.5084482431411743, "logps/chosen": -34.922969818115234, "logps/rejected": -50.05003356933594, "loss": 0.7892, "rewards/accuracies": 0.0, "rewards/chosen": 2.4867374897003174, "rewards/margins": -1.164536714553833, "rewards/rejected": 3.6512742042541504, "step": 3933 }, { "epoch": 0.87, "learning_rate": 6.276196743378977e-06, "logits/chosen": -1.7068614959716797, "logits/rejected": -1.647529125213623, "logps/chosen": -85.84532928466797, "logps/rejected": -81.02972412109375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 6.962239742279053, "rewards/margins": 4.655927658081055, "rewards/rejected": 2.306311845779419, "step": 3934 }, { "epoch": 0.87, "learning_rate": 6.27446369901338e-06, "logits/chosen": -1.77814781665802, "logits/rejected": -1.7743752002716064, "logps/chosen": -173.8028106689453, "logps/rejected": -74.97775268554688, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 5.898404121398926, "rewards/margins": 2.9513306617736816, "rewards/rejected": 2.947073459625244, "step": 3935 }, { "epoch": 0.87, "learning_rate": 6.272730490882264e-06, "logits/chosen": -1.9414901733398438, "logits/rejected": -1.8920866250991821, "logps/chosen": -82.07839965820312, "logps/rejected": -57.49810028076172, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": 5.526666164398193, "rewards/margins": 2.664341688156128, "rewards/rejected": 2.8623244762420654, "step": 3936 }, { "epoch": 0.87, "learning_rate": 6.270997119208339e-06, "logits/chosen": -1.6737070083618164, "logits/rejected": -1.6602524518966675, "logps/chosen": -27.441526412963867, "logps/rejected": -38.76380920410156, "loss": 0.9296, "rewards/accuracies": 0.0, "rewards/chosen": 1.9684690237045288, "rewards/margins": -1.6894174814224243, "rewards/rejected": 3.657886505126953, "step": 3937 }, { "epoch": 0.87, "learning_rate": 6.269263584214338e-06, "logits/chosen": -1.5684341192245483, "logits/rejected": -1.4975175857543945, "logps/chosen": -90.2059097290039, "logps/rejected": -141.63348388671875, "loss": 0.5105, "rewards/accuracies": 0.0, "rewards/chosen": 7.633098602294922, "rewards/margins": -0.21373701095581055, "rewards/rejected": 7.846835613250732, "step": 3938 }, { "epoch": 0.87, "learning_rate": 6.267529886123018e-06, "logits/chosen": -1.5655611753463745, "logits/rejected": -1.5092830657958984, "logps/chosen": -43.41380310058594, "logps/rejected": -62.69499969482422, "loss": 1.0815, "rewards/accuracies": 0.0, "rewards/chosen": 2.7318832874298096, "rewards/margins": -0.29257893562316895, "rewards/rejected": 3.0244622230529785, "step": 3939 }, { "epoch": 0.87, "learning_rate": 6.265796025157154e-06, "logits/chosen": -1.772768497467041, "logits/rejected": -1.7229050397872925, "logps/chosen": -57.03144073486328, "logps/rejected": -35.637413024902344, "loss": 0.4858, "rewards/accuracies": 1.0, "rewards/chosen": 1.794281005859375, "rewards/margins": 1.1363956928253174, "rewards/rejected": 0.6578853726387024, "step": 3940 }, { "epoch": 0.87, "learning_rate": 6.264062001539545e-06, "logits/chosen": -1.7160347700119019, "logits/rejected": -1.6614270210266113, "logps/chosen": -80.37718200683594, "logps/rejected": -63.85283660888672, "loss": 0.4597, "rewards/accuracies": 0.0, "rewards/chosen": 3.1490981578826904, "rewards/margins": -0.4069511890411377, "rewards/rejected": 3.556049346923828, "step": 3941 }, { "epoch": 0.87, "learning_rate": 6.262327815493005e-06, "logits/chosen": -1.9278465509414673, "logits/rejected": -1.8649587631225586, "logps/chosen": -102.12493896484375, "logps/rejected": -73.64868927001953, "loss": 1.0525, "rewards/accuracies": 1.0, "rewards/chosen": 7.555419921875, "rewards/margins": 3.9409079551696777, "rewards/rejected": 3.6145119667053223, "step": 3942 }, { "epoch": 0.87, "learning_rate": 6.260593467240378e-06, "logits/chosen": -1.8375630378723145, "logits/rejected": -1.8404335975646973, "logps/chosen": -70.78157043457031, "logps/rejected": -68.7861099243164, "loss": 1.7363, "rewards/accuracies": 0.0, "rewards/chosen": 2.4492690563201904, "rewards/margins": -3.4380671977996826, "rewards/rejected": 5.887336254119873, "step": 3943 }, { "epoch": 0.87, "learning_rate": 6.25885895700452e-06, "logits/chosen": -1.5925824642181396, "logits/rejected": -1.6679587364196777, "logps/chosen": -67.14918518066406, "logps/rejected": -80.26713562011719, "loss": 2.0587, "rewards/accuracies": 0.0, "rewards/chosen": 2.3478424549102783, "rewards/margins": -3.912144422531128, "rewards/rejected": 6.259986877441406, "step": 3944 }, { "epoch": 0.87, "learning_rate": 6.257124285008313e-06, "logits/chosen": -1.894216537475586, "logits/rejected": -1.880656123161316, "logps/chosen": -43.99220657348633, "logps/rejected": -28.274234771728516, "loss": 0.7709, "rewards/accuracies": 0.0, "rewards/chosen": 2.841196060180664, "rewards/margins": -1.0218384265899658, "rewards/rejected": 3.86303448677063, "step": 3945 }, { "epoch": 0.87, "learning_rate": 6.255389451474658e-06, "logits/chosen": -1.421793818473816, "logits/rejected": -1.3290503025054932, "logps/chosen": -53.3919563293457, "logps/rejected": -12.663838386535645, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 3.8236629962921143, "rewards/margins": 3.650428533554077, "rewards/rejected": 0.17323437333106995, "step": 3946 }, { "epoch": 0.87, "learning_rate": 6.253654456626475e-06, "logits/chosen": -1.578087568283081, "logits/rejected": -1.4322601556777954, "logps/chosen": -63.678070068359375, "logps/rejected": -98.47138977050781, "loss": 0.2644, "rewards/accuracies": 1.0, "rewards/chosen": 5.896690368652344, "rewards/margins": 4.796053409576416, "rewards/rejected": 1.1006370782852173, "step": 3947 }, { "epoch": 0.87, "learning_rate": 6.251919300686708e-06, "logits/chosen": -1.469449758529663, "logits/rejected": -1.5726909637451172, "logps/chosen": -31.462539672851562, "logps/rejected": -58.81930160522461, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 2.5793251991271973, "rewards/margins": -1.0092830657958984, "rewards/rejected": 3.5886082649230957, "step": 3948 }, { "epoch": 0.87, "learning_rate": 6.25018398387832e-06, "logits/chosen": -1.8056371212005615, "logits/rejected": -1.7301809787750244, "logps/chosen": -149.1851043701172, "logps/rejected": -111.05754089355469, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 6.185240268707275, "rewards/margins": 2.3184707164764404, "rewards/rejected": 3.866769552230835, "step": 3949 }, { "epoch": 0.87, "learning_rate": 6.248448506424296e-06, "logits/chosen": -1.9584708213806152, "logits/rejected": -1.9365546703338623, "logps/chosen": -75.62075805664062, "logps/rejected": -56.29914855957031, "loss": 0.1817, "rewards/accuracies": 1.0, "rewards/chosen": 4.406684398651123, "rewards/margins": 1.110611915588379, "rewards/rejected": 3.296072483062744, "step": 3950 }, { "epoch": 0.87, "learning_rate": 6.246712868547639e-06, "logits/chosen": -1.4510210752487183, "logits/rejected": -1.3351714611053467, "logps/chosen": -59.690223693847656, "logps/rejected": -8.150223731994629, "loss": 0.3532, "rewards/accuracies": 1.0, "rewards/chosen": 4.973923683166504, "rewards/margins": 4.316987991333008, "rewards/rejected": 0.6569356322288513, "step": 3951 }, { "epoch": 0.87, "learning_rate": 6.244977070471377e-06, "logits/chosen": -1.46484375, "logits/rejected": -1.4678075313568115, "logps/chosen": -9.394746780395508, "logps/rejected": -3.543592691421509, "loss": 0.8876, "rewards/accuracies": 1.0, "rewards/chosen": 2.8912527561187744, "rewards/margins": 2.3011016845703125, "rewards/rejected": 0.5901510119438171, "step": 3952 }, { "epoch": 0.87, "learning_rate": 6.243241112418554e-06, "logits/chosen": -1.5650863647460938, "logits/rejected": -1.3127809762954712, "logps/chosen": -90.29169464111328, "logps/rejected": -33.14133071899414, "loss": 0.1906, "rewards/accuracies": 1.0, "rewards/chosen": 4.2339348793029785, "rewards/margins": 3.842550039291382, "rewards/rejected": 0.39138489961624146, "step": 3953 }, { "epoch": 0.88, "learning_rate": 6.241504994612237e-06, "logits/chosen": -1.6538900136947632, "logits/rejected": -1.6052342653274536, "logps/chosen": -57.529300689697266, "logps/rejected": -38.864013671875, "loss": 0.1711, "rewards/accuracies": 1.0, "rewards/chosen": 3.9564969539642334, "rewards/margins": 1.179464340209961, "rewards/rejected": 2.7770326137542725, "step": 3954 }, { "epoch": 0.88, "learning_rate": 6.239768717275512e-06, "logits/chosen": -1.4270579814910889, "logits/rejected": -1.2969330549240112, "logps/chosen": -49.88311767578125, "logps/rejected": -27.85641098022461, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 4.554498195648193, "rewards/margins": 3.380143642425537, "rewards/rejected": 1.1743545532226562, "step": 3955 }, { "epoch": 0.88, "learning_rate": 6.238032280631487e-06, "logits/chosen": -1.6958093643188477, "logits/rejected": -1.7320328950881958, "logps/chosen": -54.59439468383789, "logps/rejected": -105.37593078613281, "loss": 1.8509, "rewards/accuracies": 1.0, "rewards/chosen": 3.318469762802124, "rewards/margins": 1.310713768005371, "rewards/rejected": 2.007755994796753, "step": 3956 }, { "epoch": 0.88, "learning_rate": 6.236295684903291e-06, "logits/chosen": -1.3605151176452637, "logits/rejected": -1.1359471082687378, "logps/chosen": -135.26409912109375, "logps/rejected": -103.56228637695312, "loss": 1.0793, "rewards/accuracies": 0.0, "rewards/chosen": 5.621243476867676, "rewards/margins": -0.13064861297607422, "rewards/rejected": 5.75189208984375, "step": 3957 }, { "epoch": 0.88, "learning_rate": 6.234558930314071e-06, "logits/chosen": -2.042923927307129, "logits/rejected": -1.959844946861267, "logps/chosen": -56.005332946777344, "logps/rejected": -15.329947471618652, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 3.7607057094573975, "rewards/margins": 3.4728589057922363, "rewards/rejected": 0.2878468632698059, "step": 3958 }, { "epoch": 0.88, "learning_rate": 6.232822017086996e-06, "logits/chosen": -1.802734613418579, "logits/rejected": -1.7206640243530273, "logps/chosen": -49.419219970703125, "logps/rejected": -58.31175231933594, "loss": 2.0399, "rewards/accuracies": 0.0, "rewards/chosen": 4.016997814178467, "rewards/margins": -2.4079084396362305, "rewards/rejected": 6.424906253814697, "step": 3959 }, { "epoch": 0.88, "learning_rate": 6.231084945445255e-06, "logits/chosen": -1.870276689529419, "logits/rejected": -1.844771146774292, "logps/chosen": -141.33676147460938, "logps/rejected": -131.07583618164062, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 8.378554344177246, "rewards/margins": 1.5753026008605957, "rewards/rejected": 6.80325174331665, "step": 3960 }, { "epoch": 0.88, "learning_rate": 6.229347715612059e-06, "logits/chosen": -1.6145883798599243, "logits/rejected": -1.5673937797546387, "logps/chosen": -91.40364837646484, "logps/rejected": -64.0855712890625, "loss": 0.9556, "rewards/accuracies": 0.0, "rewards/chosen": 2.8885161876678467, "rewards/margins": -1.6885955333709717, "rewards/rejected": 4.577111721038818, "step": 3961 }, { "epoch": 0.88, "learning_rate": 6.227610327810636e-06, "logits/chosen": -1.84273099899292, "logits/rejected": -1.8114489316940308, "logps/chosen": -38.05753707885742, "logps/rejected": -145.4178466796875, "loss": 0.7126, "rewards/accuracies": 1.0, "rewards/chosen": 2.232454776763916, "rewards/margins": 0.31326377391815186, "rewards/rejected": 1.9191910028457642, "step": 3962 }, { "epoch": 0.88, "learning_rate": 6.2258727822642376e-06, "logits/chosen": -1.9882022142410278, "logits/rejected": -1.9189295768737793, "logps/chosen": -54.91255187988281, "logps/rejected": -20.9478759765625, "loss": 1.1748, "rewards/accuracies": 1.0, "rewards/chosen": 3.037724256515503, "rewards/margins": 1.2691295146942139, "rewards/rejected": 1.768594741821289, "step": 3963 }, { "epoch": 0.88, "learning_rate": 6.224135079196133e-06, "logits/chosen": -1.8532156944274902, "logits/rejected": -1.8279979228973389, "logps/chosen": -145.4294891357422, "logps/rejected": -62.3315544128418, "loss": 0.3857, "rewards/accuracies": 0.0, "rewards/chosen": 6.48996114730835, "rewards/margins": -0.09552001953125, "rewards/rejected": 6.5854811668396, "step": 3964 }, { "epoch": 0.88, "learning_rate": 6.2223972188296135e-06, "logits/chosen": -1.5871648788452148, "logits/rejected": -1.5985033512115479, "logps/chosen": -50.40030288696289, "logps/rejected": -80.78268432617188, "loss": 1.4749, "rewards/accuracies": 0.0, "rewards/chosen": 0.9898540377616882, "rewards/margins": -1.6816165447235107, "rewards/rejected": 2.6714706420898438, "step": 3965 }, { "epoch": 0.88, "learning_rate": 6.2206592013879895e-06, "logits/chosen": -1.7866287231445312, "logits/rejected": -1.7866287231445312, "logps/chosen": -82.49307250976562, "logps/rejected": -82.49307250976562, "loss": 0.4101, "rewards/accuracies": 0.0, "rewards/chosen": 3.9569923877716064, "rewards/margins": 0.0, "rewards/rejected": 3.9569923877716064, "step": 3966 }, { "epoch": 0.88, "learning_rate": 6.218921027094594e-06, "logits/chosen": -1.3383734226226807, "logits/rejected": -1.2787270545959473, "logps/chosen": -42.92118835449219, "logps/rejected": -70.4317855834961, "loss": 2.8021, "rewards/accuracies": 0.0, "rewards/chosen": 3.4790236949920654, "rewards/margins": -0.7351577281951904, "rewards/rejected": 4.214181423187256, "step": 3967 }, { "epoch": 0.88, "learning_rate": 6.217182696172776e-06, "logits/chosen": -1.7223953008651733, "logits/rejected": -1.72017240524292, "logps/chosen": -154.6331787109375, "logps/rejected": -78.17420959472656, "loss": 0.9893, "rewards/accuracies": 0.0, "rewards/chosen": 4.116611003875732, "rewards/margins": -1.8228073120117188, "rewards/rejected": 5.939418315887451, "step": 3968 }, { "epoch": 0.88, "learning_rate": 6.215444208845907e-06, "logits/chosen": -2.0890135765075684, "logits/rejected": -2.0948822498321533, "logps/chosen": -111.88874816894531, "logps/rejected": -72.93144989013672, "loss": 0.7601, "rewards/accuracies": 1.0, "rewards/chosen": 6.646775722503662, "rewards/margins": 4.632745742797852, "rewards/rejected": 2.0140297412872314, "step": 3969 }, { "epoch": 0.88, "learning_rate": 6.21370556533738e-06, "logits/chosen": -1.760325312614441, "logits/rejected": -1.7126753330230713, "logps/chosen": -35.278846740722656, "logps/rejected": -25.548810958862305, "loss": 0.5199, "rewards/accuracies": 1.0, "rewards/chosen": 1.8032020330429077, "rewards/margins": 0.5818425416946411, "rewards/rejected": 1.2213594913482666, "step": 3970 }, { "epoch": 0.88, "learning_rate": 6.2119667658706055e-06, "logits/chosen": -1.2477424144744873, "logits/rejected": -1.3570854663848877, "logps/chosen": -103.3255615234375, "logps/rejected": -144.21566772460938, "loss": 1.8301, "rewards/accuracies": 0.0, "rewards/chosen": 2.220310926437378, "rewards/margins": -3.5962741374969482, "rewards/rejected": 5.816585063934326, "step": 3971 }, { "epoch": 0.88, "learning_rate": 6.210227810669014e-06, "logits/chosen": -1.7032999992370605, "logits/rejected": -1.6907451152801514, "logps/chosen": -19.64702796936035, "logps/rejected": -36.90879821777344, "loss": 0.4709, "rewards/accuracies": 1.0, "rewards/chosen": 2.0654356479644775, "rewards/margins": 0.93593430519104, "rewards/rejected": 1.1295013427734375, "step": 3972 }, { "epoch": 0.88, "learning_rate": 6.208488699956059e-06, "logits/chosen": -1.5105546712875366, "logits/rejected": -1.4565651416778564, "logps/chosen": -51.561790466308594, "logps/rejected": -32.57044219970703, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 3.0888679027557373, "rewards/margins": 0.7349162101745605, "rewards/rejected": 2.3539516925811768, "step": 3973 }, { "epoch": 0.88, "learning_rate": 6.20674943395521e-06, "logits/chosen": -1.34657883644104, "logits/rejected": -1.3631172180175781, "logps/chosen": -38.927825927734375, "logps/rejected": -54.247947692871094, "loss": 1.9327, "rewards/accuracies": 0.0, "rewards/chosen": 2.075026750564575, "rewards/margins": -3.82859206199646, "rewards/rejected": 5.903618812561035, "step": 3974 }, { "epoch": 0.88, "learning_rate": 6.205010012889962e-06, "logits/chosen": -1.459905982017517, "logits/rejected": -1.4509601593017578, "logps/chosen": -84.10005187988281, "logps/rejected": -56.89116668701172, "loss": 0.27, "rewards/accuracies": 1.0, "rewards/chosen": 2.5940933227539062, "rewards/margins": 0.3600027561187744, "rewards/rejected": 2.234090566635132, "step": 3975 }, { "epoch": 0.88, "learning_rate": 6.2032704369838214e-06, "logits/chosen": -1.7976361513137817, "logits/rejected": -1.7748769521713257, "logps/chosen": -86.45626831054688, "logps/rejected": -41.42589569091797, "loss": 3.0547, "rewards/accuracies": 0.0, "rewards/chosen": 3.5508956909179688, "rewards/margins": -0.3140525817871094, "rewards/rejected": 3.864948272705078, "step": 3976 }, { "epoch": 0.88, "learning_rate": 6.201530706460324e-06, "logits/chosen": -1.7571814060211182, "logits/rejected": -1.7289743423461914, "logps/chosen": -43.70091247558594, "logps/rejected": -43.7647705078125, "loss": 2.092, "rewards/accuracies": 0.0, "rewards/chosen": 1.743447184562683, "rewards/margins": -1.013095736503601, "rewards/rejected": 2.756542921066284, "step": 3977 }, { "epoch": 0.88, "learning_rate": 6.199790821543019e-06, "logits/chosen": -1.8832895755767822, "logits/rejected": -1.8296149969100952, "logps/chosen": -67.36387634277344, "logps/rejected": -78.06190490722656, "loss": 2.1957, "rewards/accuracies": 0.0, "rewards/chosen": 1.5748611688613892, "rewards/margins": -3.040651798248291, "rewards/rejected": 4.615512847900391, "step": 3978 }, { "epoch": 0.88, "learning_rate": 6.198050782455478e-06, "logits/chosen": -1.5389611721038818, "logits/rejected": -1.5146257877349854, "logps/chosen": -39.42010498046875, "logps/rejected": -60.70026397705078, "loss": 0.4423, "rewards/accuracies": 0.0, "rewards/chosen": 3.5904266834259033, "rewards/margins": -0.041678667068481445, "rewards/rejected": 3.6321053504943848, "step": 3979 }, { "epoch": 0.88, "learning_rate": 6.19631058942129e-06, "logits/chosen": -1.2723172903060913, "logits/rejected": -1.2671459913253784, "logps/chosen": -51.61280059814453, "logps/rejected": -61.42176055908203, "loss": 0.6022, "rewards/accuracies": 1.0, "rewards/chosen": 2.8869354724884033, "rewards/margins": 0.40434885025024414, "rewards/rejected": 2.482586622238159, "step": 3980 }, { "epoch": 0.88, "learning_rate": 6.194570242664069e-06, "logits/chosen": -1.9051604270935059, "logits/rejected": -1.93142569065094, "logps/chosen": -97.35366821289062, "logps/rejected": -114.04443359375, "loss": 0.5155, "rewards/accuracies": 0.0, "rewards/chosen": 7.848117351531982, "rewards/margins": -0.5815615653991699, "rewards/rejected": 8.429678916931152, "step": 3981 }, { "epoch": 0.88, "learning_rate": 6.192829742407442e-06, "logits/chosen": -1.9107311964035034, "logits/rejected": -1.807064414024353, "logps/chosen": -35.677589416503906, "logps/rejected": -31.309656143188477, "loss": 0.746, "rewards/accuracies": 0.0, "rewards/chosen": 2.33109974861145, "rewards/margins": -1.1352486610412598, "rewards/rejected": 3.46634840965271, "step": 3982 }, { "epoch": 0.88, "learning_rate": 6.1910890888750605e-06, "logits/chosen": -1.722877860069275, "logits/rejected": -1.6599206924438477, "logps/chosen": -37.195709228515625, "logps/rejected": -34.052223205566406, "loss": 0.2464, "rewards/accuracies": 1.0, "rewards/chosen": 4.843049049377441, "rewards/margins": 0.5626397132873535, "rewards/rejected": 4.280409336090088, "step": 3983 }, { "epoch": 0.88, "learning_rate": 6.189348282290595e-06, "logits/chosen": -1.7426482439041138, "logits/rejected": -1.752342700958252, "logps/chosen": -66.90665435791016, "logps/rejected": -59.31125259399414, "loss": 1.5605, "rewards/accuracies": 0.0, "rewards/chosen": 3.9015634059906006, "rewards/margins": -2.7311699390411377, "rewards/rejected": 6.632733345031738, "step": 3984 }, { "epoch": 0.88, "learning_rate": 6.1876073228777344e-06, "logits/chosen": -1.95012366771698, "logits/rejected": -1.8160287141799927, "logps/chosen": -83.70948028564453, "logps/rejected": -18.24667739868164, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 6.667726993560791, "rewards/margins": 4.4435224533081055, "rewards/rejected": 2.2242047786712646, "step": 3985 }, { "epoch": 0.88, "learning_rate": 6.185866210860189e-06, "logits/chosen": -1.5188276767730713, "logits/rejected": -1.5188276767730713, "logps/chosen": -57.315521240234375, "logps/rejected": -57.315521240234375, "loss": 0.8568, "rewards/accuracies": 0.0, "rewards/chosen": 3.051901340484619, "rewards/margins": 0.0, "rewards/rejected": 3.051901340484619, "step": 3986 }, { "epoch": 0.88, "learning_rate": 6.184124946461684e-06, "logits/chosen": -1.3882423639297485, "logits/rejected": -1.3355008363723755, "logps/chosen": -37.132259368896484, "logps/rejected": -41.340003967285156, "loss": 0.8105, "rewards/accuracies": 0.0, "rewards/chosen": 2.3550548553466797, "rewards/margins": -0.5861599445343018, "rewards/rejected": 2.9412147998809814, "step": 3987 }, { "epoch": 0.88, "learning_rate": 6.182383529905972e-06, "logits/chosen": -1.7337111234664917, "logits/rejected": -1.6033596992492676, "logps/chosen": -108.92515563964844, "logps/rejected": -47.30708694458008, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 6.599890232086182, "rewards/margins": 5.593287944793701, "rewards/rejected": 1.006602168083191, "step": 3988 }, { "epoch": 0.88, "learning_rate": 6.180641961416817e-06, "logits/chosen": -1.7694334983825684, "logits/rejected": -1.7552345991134644, "logps/chosen": -130.9669647216797, "logps/rejected": -107.02965545654297, "loss": 3.4847, "rewards/accuracies": 0.0, "rewards/chosen": 4.886796474456787, "rewards/margins": -4.120023250579834, "rewards/rejected": 9.006819725036621, "step": 3989 }, { "epoch": 0.88, "learning_rate": 6.178900241218009e-06, "logits/chosen": -1.97252357006073, "logits/rejected": -1.9385467767715454, "logps/chosen": -45.244483947753906, "logps/rejected": -37.64316940307617, "loss": 1.1714, "rewards/accuracies": 0.0, "rewards/chosen": 3.138904571533203, "rewards/margins": -0.16407513618469238, "rewards/rejected": 3.3029797077178955, "step": 3990 }, { "epoch": 0.88, "learning_rate": 6.177158369533354e-06, "logits/chosen": -1.711706280708313, "logits/rejected": -1.617279052734375, "logps/chosen": -101.80668640136719, "logps/rejected": -55.70926284790039, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 5.669167995452881, "rewards/margins": 4.241645812988281, "rewards/rejected": 1.4275219440460205, "step": 3991 }, { "epoch": 0.88, "learning_rate": 6.175416346586677e-06, "logits/chosen": -1.6826426982879639, "logits/rejected": -1.5342973470687866, "logps/chosen": -56.57819747924805, "logps/rejected": -24.51403045654297, "loss": 0.1905, "rewards/accuracies": 1.0, "rewards/chosen": 2.3192737102508545, "rewards/margins": 0.8284080028533936, "rewards/rejected": 1.490865707397461, "step": 3992 }, { "epoch": 0.88, "learning_rate": 6.173674172601826e-06, "logits/chosen": -1.6164264678955078, "logits/rejected": -1.6247197389602661, "logps/chosen": -36.523921966552734, "logps/rejected": -50.40354919433594, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": 2.047537326812744, "rewards/margins": 0.25886237621307373, "rewards/rejected": 1.7886749505996704, "step": 3993 }, { "epoch": 0.88, "learning_rate": 6.171931847802665e-06, "logits/chosen": -1.8623042106628418, "logits/rejected": -1.8419084548950195, "logps/chosen": -53.927005767822266, "logps/rejected": -39.76890182495117, "loss": 1.6177, "rewards/accuracies": 0.0, "rewards/chosen": 2.567869186401367, "rewards/margins": -1.1575310230255127, "rewards/rejected": 3.72540020942688, "step": 3994 }, { "epoch": 0.88, "learning_rate": 6.17018937241308e-06, "logits/chosen": -1.9135994911193848, "logits/rejected": -1.8059959411621094, "logps/chosen": -103.26148986816406, "logps/rejected": -126.71796417236328, "loss": 1.0877, "rewards/accuracies": 0.0, "rewards/chosen": 4.653092861175537, "rewards/margins": -1.999100685119629, "rewards/rejected": 6.652193546295166, "step": 3995 }, { "epoch": 0.88, "learning_rate": 6.168446746656973e-06, "logits/chosen": -1.459344744682312, "logits/rejected": -1.4693715572357178, "logps/chosen": -159.06143188476562, "logps/rejected": -144.1624298095703, "loss": 1.8032, "rewards/accuracies": 0.0, "rewards/chosen": 6.841011047363281, "rewards/margins": -2.9983978271484375, "rewards/rejected": 9.839408874511719, "step": 3996 }, { "epoch": 0.88, "learning_rate": 6.1667039707582685e-06, "logits/chosen": -1.6331337690353394, "logits/rejected": -1.5047887563705444, "logps/chosen": -40.92406463623047, "logps/rejected": -11.29737663269043, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": 3.7434792518615723, "rewards/margins": 3.0330276489257812, "rewards/rejected": 0.7104517221450806, "step": 3997 }, { "epoch": 0.88, "learning_rate": 6.164961044940907e-06, "logits/chosen": -1.6512644290924072, "logits/rejected": -1.6508324146270752, "logps/chosen": -57.65795135498047, "logps/rejected": -30.367168426513672, "loss": 0.7644, "rewards/accuracies": 0.0, "rewards/chosen": 3.0275375843048096, "rewards/margins": -0.26723432540893555, "rewards/rejected": 3.294771909713745, "step": 3998 }, { "epoch": 0.89, "learning_rate": 6.163217969428853e-06, "logits/chosen": -1.5901333093643188, "logits/rejected": -1.6035059690475464, "logps/chosen": -44.13435745239258, "logps/rejected": -40.2693977355957, "loss": 0.5315, "rewards/accuracies": 0.0, "rewards/chosen": 2.6620571613311768, "rewards/margins": -0.4121108055114746, "rewards/rejected": 3.0741679668426514, "step": 3999 }, { "epoch": 0.89, "learning_rate": 6.161474744446086e-06, "logits/chosen": -1.7306017875671387, "logits/rejected": -1.705931544303894, "logps/chosen": -66.98112487792969, "logps/rejected": -89.81922912597656, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 5.809359073638916, "rewards/margins": 3.157137393951416, "rewards/rejected": 2.6522216796875, "step": 4000 }, { "epoch": 0.89, "learning_rate": 6.159731370216605e-06, "logits/chosen": -1.3127952814102173, "logits/rejected": -1.3127952814102173, "logps/chosen": -35.59571075439453, "logps/rejected": -35.59571075439453, "loss": 0.3498, "rewards/accuracies": 0.0, "rewards/chosen": 4.574962615966797, "rewards/margins": 0.0, "rewards/rejected": 4.574962615966797, "step": 4001 }, { "epoch": 0.89, "learning_rate": 6.157987846964432e-06, "logits/chosen": -1.5743755102157593, "logits/rejected": -1.497462511062622, "logps/chosen": -16.45306396484375, "logps/rejected": -14.45088005065918, "loss": 0.4557, "rewards/accuracies": 1.0, "rewards/chosen": 1.5334848165512085, "rewards/margins": 1.0126069784164429, "rewards/rejected": 0.5208778381347656, "step": 4002 }, { "epoch": 0.89, "learning_rate": 6.156244174913604e-06, "logits/chosen": -1.5334999561309814, "logits/rejected": -1.385453462600708, "logps/chosen": -147.84722900390625, "logps/rejected": -29.66783332824707, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 6.2439117431640625, "rewards/margins": 4.218339920043945, "rewards/rejected": 2.0255720615386963, "step": 4003 }, { "epoch": 0.89, "learning_rate": 6.154500354288181e-06, "logits/chosen": -1.2954769134521484, "logits/rejected": -1.3233211040496826, "logps/chosen": -32.17642593383789, "logps/rejected": -54.08597946166992, "loss": 0.8614, "rewards/accuracies": 0.0, "rewards/chosen": 2.1293885707855225, "rewards/margins": -0.49306201934814453, "rewards/rejected": 2.622450590133667, "step": 4004 }, { "epoch": 0.89, "learning_rate": 6.152756385312235e-06, "logits/chosen": -1.9624757766723633, "logits/rejected": -1.9155728816986084, "logps/chosen": -76.95147705078125, "logps/rejected": -63.525970458984375, "loss": 1.5827, "rewards/accuracies": 1.0, "rewards/chosen": 7.738772869110107, "rewards/margins": 4.995303630828857, "rewards/rejected": 2.74346923828125, "step": 4005 }, { "epoch": 0.89, "learning_rate": 6.151012268209866e-06, "logits/chosen": -1.7982982397079468, "logits/rejected": -1.6780449151992798, "logps/chosen": -86.46443939208984, "logps/rejected": -25.468730926513672, "loss": 0.1795, "rewards/accuracies": 1.0, "rewards/chosen": 2.327963352203369, "rewards/margins": 1.6279454231262207, "rewards/rejected": 0.7000179290771484, "step": 4006 }, { "epoch": 0.89, "learning_rate": 6.1492680032051885e-06, "logits/chosen": -1.681573748588562, "logits/rejected": -1.6436271667480469, "logps/chosen": -38.38450622558594, "logps/rejected": -46.94869613647461, "loss": 0.9231, "rewards/accuracies": 0.0, "rewards/chosen": 1.5908126831054688, "rewards/margins": -0.9297091960906982, "rewards/rejected": 2.520521879196167, "step": 4007 }, { "epoch": 0.89, "learning_rate": 6.147523590522334e-06, "logits/chosen": -1.6455082893371582, "logits/rejected": -1.6205934286117554, "logps/chosen": -58.926429748535156, "logps/rejected": -53.202781677246094, "loss": 0.2474, "rewards/accuracies": 1.0, "rewards/chosen": 3.130300283432007, "rewards/margins": 0.4832627773284912, "rewards/rejected": 2.6470375061035156, "step": 4008 }, { "epoch": 0.89, "learning_rate": 6.145779030385457e-06, "logits/chosen": -1.5127085447311401, "logits/rejected": -1.5191422700881958, "logps/chosen": -52.37210464477539, "logps/rejected": -60.04889678955078, "loss": 0.9871, "rewards/accuracies": 0.0, "rewards/chosen": 2.4904630184173584, "rewards/margins": -1.7514684200286865, "rewards/rejected": 4.241931438446045, "step": 4009 }, { "epoch": 0.89, "learning_rate": 6.144034323018728e-06, "logits/chosen": -1.7392672300338745, "logits/rejected": -1.7364505529403687, "logps/chosen": -68.92823791503906, "logps/rejected": -83.95301818847656, "loss": 0.2851, "rewards/accuracies": 1.0, "rewards/chosen": 3.3465752601623535, "rewards/margins": 0.7134759426116943, "rewards/rejected": 2.633099317550659, "step": 4010 }, { "epoch": 0.89, "learning_rate": 6.1422894686463395e-06, "logits/chosen": -1.4362484216690063, "logits/rejected": -1.2476743459701538, "logps/chosen": -114.08696746826172, "logps/rejected": -91.25831604003906, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 5.667539119720459, "rewards/margins": 2.9254095554351807, "rewards/rejected": 2.7421295642852783, "step": 4011 }, { "epoch": 0.89, "learning_rate": 6.140544467492502e-06, "logits/chosen": -1.6912877559661865, "logits/rejected": -1.6436059474945068, "logps/chosen": -45.90909194946289, "logps/rejected": -50.489566802978516, "loss": 0.4869, "rewards/accuracies": 1.0, "rewards/chosen": 3.6777782440185547, "rewards/margins": 1.126708984375, "rewards/rejected": 2.5510692596435547, "step": 4012 }, { "epoch": 0.89, "learning_rate": 6.13879931978144e-06, "logits/chosen": -1.681463599205017, "logits/rejected": -1.7070573568344116, "logps/chosen": -82.31632232666016, "logps/rejected": -47.828208923339844, "loss": 1.3216, "rewards/accuracies": 0.0, "rewards/chosen": 3.6473190784454346, "rewards/margins": -0.8111860752105713, "rewards/rejected": 4.458505153656006, "step": 4013 }, { "epoch": 0.89, "learning_rate": 6.137054025737404e-06, "logits/chosen": -1.513847827911377, "logits/rejected": -1.537887454032898, "logps/chosen": -57.786460876464844, "logps/rejected": -88.82308197021484, "loss": 2.4981, "rewards/accuracies": 0.0, "rewards/chosen": 2.323992967605591, "rewards/margins": -4.985431671142578, "rewards/rejected": 7.309424877166748, "step": 4014 }, { "epoch": 0.89, "learning_rate": 6.135308585584657e-06, "logits/chosen": -1.6361043453216553, "logits/rejected": -1.6361043453216553, "logps/chosen": -35.10149383544922, "logps/rejected": -35.10149383544922, "loss": 0.4265, "rewards/accuracies": 0.0, "rewards/chosen": 2.0941002368927, "rewards/margins": 0.0, "rewards/rejected": 2.0941002368927, "step": 4015 }, { "epoch": 0.89, "learning_rate": 6.1335629995474875e-06, "logits/chosen": -1.5732742547988892, "logits/rejected": -1.5732742547988892, "logps/chosen": -6.037571907043457, "logps/rejected": -6.037571907043457, "loss": 0.3714, "rewards/accuracies": 0.0, "rewards/chosen": 1.7730270624160767, "rewards/margins": 0.0, "rewards/rejected": 1.7730270624160767, "step": 4016 }, { "epoch": 0.89, "learning_rate": 6.131817267850198e-06, "logits/chosen": -1.9398047924041748, "logits/rejected": -1.9044616222381592, "logps/chosen": -146.6904296875, "logps/rejected": -50.02843475341797, "loss": 0.1573, "rewards/accuracies": 1.0, "rewards/chosen": 3.708491563796997, "rewards/margins": 1.1049048900604248, "rewards/rejected": 2.6035866737365723, "step": 4017 }, { "epoch": 0.89, "learning_rate": 6.1300713907171105e-06, "logits/chosen": -1.6446696519851685, "logits/rejected": -1.5231841802597046, "logps/chosen": -57.303802490234375, "logps/rejected": -30.182632446289062, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": 5.360759258270264, "rewards/margins": 1.6883838176727295, "rewards/rejected": 3.672375440597534, "step": 4018 }, { "epoch": 0.89, "learning_rate": 6.128325368372565e-06, "logits/chosen": -1.7076209783554077, "logits/rejected": -1.7005202770233154, "logps/chosen": -61.68828582763672, "logps/rejected": -68.95852661132812, "loss": 0.6038, "rewards/accuracies": 0.0, "rewards/chosen": 2.40470814704895, "rewards/margins": -0.22672510147094727, "rewards/rejected": 2.6314332485198975, "step": 4019 }, { "epoch": 0.89, "learning_rate": 6.126579201040923e-06, "logits/chosen": -1.7226588726043701, "logits/rejected": -1.7092565298080444, "logps/chosen": -106.37256622314453, "logps/rejected": -50.552772521972656, "loss": 0.3796, "rewards/accuracies": 1.0, "rewards/chosen": 5.76033878326416, "rewards/margins": 2.6851181983947754, "rewards/rejected": 3.0752205848693848, "step": 4020 }, { "epoch": 0.89, "learning_rate": 6.124832888946563e-06, "logits/chosen": -1.4802814722061157, "logits/rejected": -1.5181093215942383, "logps/chosen": -30.065170288085938, "logps/rejected": -22.656726837158203, "loss": 0.8131, "rewards/accuracies": 1.0, "rewards/chosen": 1.6050728559494019, "rewards/margins": 0.08013653755187988, "rewards/rejected": 1.524936318397522, "step": 4021 }, { "epoch": 0.89, "learning_rate": 6.123086432313878e-06, "logits/chosen": -1.3818033933639526, "logits/rejected": -1.2934662103652954, "logps/chosen": -67.88367462158203, "logps/rejected": -52.77435302734375, "loss": 2.4686, "rewards/accuracies": 1.0, "rewards/chosen": 6.704164981842041, "rewards/margins": 5.21145486831665, "rewards/rejected": 1.4927101135253906, "step": 4022 }, { "epoch": 0.89, "learning_rate": 6.121339831367288e-06, "logits/chosen": -1.7235074043273926, "logits/rejected": -1.7631868124008179, "logps/chosen": -75.94512939453125, "logps/rejected": -85.07984161376953, "loss": 4.6543, "rewards/accuracies": 0.0, "rewards/chosen": 2.07303786277771, "rewards/margins": -3.8022239208221436, "rewards/rejected": 5.8752617835998535, "step": 4023 }, { "epoch": 0.89, "learning_rate": 6.119593086331225e-06, "logits/chosen": -1.4970862865447998, "logits/rejected": -1.2357537746429443, "logps/chosen": -78.6773681640625, "logps/rejected": -24.4997615814209, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 5.142103672027588, "rewards/margins": 5.054297924041748, "rewards/rejected": 0.08780556172132492, "step": 4024 }, { "epoch": 0.89, "learning_rate": 6.117846197430141e-06, "logits/chosen": -1.649560570716858, "logits/rejected": -1.6490086317062378, "logps/chosen": -65.32866668701172, "logps/rejected": -56.50087356567383, "loss": 1.5137, "rewards/accuracies": 0.0, "rewards/chosen": 1.2246147394180298, "rewards/margins": -1.5206218957901, "rewards/rejected": 2.74523663520813, "step": 4025 }, { "epoch": 0.89, "learning_rate": 6.1160991648885085e-06, "logits/chosen": -1.744736909866333, "logits/rejected": -1.7687615156173706, "logps/chosen": -108.45152282714844, "logps/rejected": -157.94259643554688, "loss": 1.1151, "rewards/accuracies": 0.0, "rewards/chosen": 5.581639289855957, "rewards/margins": -2.0404419898986816, "rewards/rejected": 7.622081279754639, "step": 4026 }, { "epoch": 0.89, "learning_rate": 6.114351988930816e-06, "logits/chosen": -1.5584748983383179, "logits/rejected": -1.5090101957321167, "logps/chosen": -137.12030029296875, "logps/rejected": -44.18941879272461, "loss": 0.1702, "rewards/accuracies": 1.0, "rewards/chosen": 5.527078151702881, "rewards/margins": 4.472663402557373, "rewards/rejected": 1.0544147491455078, "step": 4027 }, { "epoch": 0.89, "learning_rate": 6.112604669781572e-06, "logits/chosen": -1.5347670316696167, "logits/rejected": -1.4678640365600586, "logps/chosen": -121.28856658935547, "logps/rejected": -65.78765869140625, "loss": 0.1724, "rewards/accuracies": 1.0, "rewards/chosen": 4.902835845947266, "rewards/margins": 1.621034860610962, "rewards/rejected": 3.2818009853363037, "step": 4028 }, { "epoch": 0.89, "learning_rate": 6.1108572076653026e-06, "logits/chosen": -1.7514328956604004, "logits/rejected": -1.8267940282821655, "logps/chosen": -74.65821838378906, "logps/rejected": -119.90821838378906, "loss": 3.225, "rewards/accuracies": 0.0, "rewards/chosen": 1.4493331909179688, "rewards/margins": -6.436654567718506, "rewards/rejected": 7.885987758636475, "step": 4029 }, { "epoch": 0.89, "learning_rate": 6.1091096028065545e-06, "logits/chosen": -1.595752239227295, "logits/rejected": -1.5681757926940918, "logps/chosen": -44.2378044128418, "logps/rejected": -64.23374938964844, "loss": 3.5276, "rewards/accuracies": 0.0, "rewards/chosen": 2.4277637004852295, "rewards/margins": -0.06269645690917969, "rewards/rejected": 2.490460157394409, "step": 4030 }, { "epoch": 0.89, "learning_rate": 6.1073618554298866e-06, "logits/chosen": -1.757677435874939, "logits/rejected": -1.7190874814987183, "logps/chosen": -67.38489532470703, "logps/rejected": -60.270973205566406, "loss": 1.3796, "rewards/accuracies": 0.0, "rewards/chosen": 2.8435142040252686, "rewards/margins": -1.7573306560516357, "rewards/rejected": 4.600844860076904, "step": 4031 }, { "epoch": 0.89, "learning_rate": 6.105613965759881e-06, "logits/chosen": -1.8083431720733643, "logits/rejected": -1.6800662279129028, "logps/chosen": -68.25863647460938, "logps/rejected": -22.0069637298584, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 3.6229844093322754, "rewards/margins": 1.7758688926696777, "rewards/rejected": 1.8471155166625977, "step": 4032 }, { "epoch": 0.89, "learning_rate": 6.103865934021141e-06, "logits/chosen": -1.8031785488128662, "logits/rejected": -1.8108360767364502, "logps/chosen": -73.38978576660156, "logps/rejected": -49.087799072265625, "loss": 1.2366, "rewards/accuracies": 1.0, "rewards/chosen": 3.13999342918396, "rewards/margins": 0.515049934387207, "rewards/rejected": 2.624943494796753, "step": 4033 }, { "epoch": 0.89, "learning_rate": 6.10211776043828e-06, "logits/chosen": -1.9185476303100586, "logits/rejected": -1.923075556755066, "logps/chosen": -50.503353118896484, "logps/rejected": -60.454593658447266, "loss": 0.7912, "rewards/accuracies": 0.0, "rewards/chosen": 2.5667636394500732, "rewards/margins": -0.47649312019348145, "rewards/rejected": 3.0432567596435547, "step": 4034 }, { "epoch": 0.89, "learning_rate": 6.100369445235938e-06, "logits/chosen": -1.7524023056030273, "logits/rejected": -1.6998831033706665, "logps/chosen": -129.72459411621094, "logps/rejected": -106.60234069824219, "loss": 0.6179, "rewards/accuracies": 1.0, "rewards/chosen": 7.551560878753662, "rewards/margins": 1.3612442016601562, "rewards/rejected": 6.190316677093506, "step": 4035 }, { "epoch": 0.89, "learning_rate": 6.098620988638766e-06, "logits/chosen": -1.7500134706497192, "logits/rejected": -1.7770054340362549, "logps/chosen": -120.12284851074219, "logps/rejected": -70.04273986816406, "loss": 0.1554, "rewards/accuracies": 1.0, "rewards/chosen": 7.086434841156006, "rewards/margins": 2.2702460289001465, "rewards/rejected": 4.816188812255859, "step": 4036 }, { "epoch": 0.89, "learning_rate": 6.09687239087144e-06, "logits/chosen": -1.5110044479370117, "logits/rejected": -1.5474460124969482, "logps/chosen": -48.44221496582031, "logps/rejected": -64.02409362792969, "loss": 0.1893, "rewards/accuracies": 1.0, "rewards/chosen": 4.0668840408325195, "rewards/margins": 0.8078515529632568, "rewards/rejected": 3.2590324878692627, "step": 4037 }, { "epoch": 0.89, "learning_rate": 6.095123652158648e-06, "logits/chosen": -1.771360158920288, "logits/rejected": -1.751237154006958, "logps/chosen": -92.07841491699219, "logps/rejected": -45.73387908935547, "loss": 0.3526, "rewards/accuracies": 1.0, "rewards/chosen": 4.0553741455078125, "rewards/margins": 1.9184019565582275, "rewards/rejected": 2.136972188949585, "step": 4038 }, { "epoch": 0.89, "learning_rate": 6.093374772725098e-06, "logits/chosen": -1.4923162460327148, "logits/rejected": -1.5371119976043701, "logps/chosen": -58.227970123291016, "logps/rejected": -113.11959075927734, "loss": 2.1502, "rewards/accuracies": 0.0, "rewards/chosen": 2.863403797149658, "rewards/margins": -3.9149770736694336, "rewards/rejected": 6.778380870819092, "step": 4039 }, { "epoch": 0.89, "learning_rate": 6.0916257527955194e-06, "logits/chosen": -1.3318983316421509, "logits/rejected": -1.293246865272522, "logps/chosen": -6.807072639465332, "logps/rejected": -16.61915397644043, "loss": 0.2783, "rewards/accuracies": 1.0, "rewards/chosen": 0.8047910928726196, "rewards/margins": 0.2988271713256836, "rewards/rejected": 0.505963921546936, "step": 4040 }, { "epoch": 0.89, "learning_rate": 6.089876592594655e-06, "logits/chosen": -1.68182373046875, "logits/rejected": -1.6350560188293457, "logps/chosen": -43.91500473022461, "logps/rejected": -49.56327819824219, "loss": 0.7565, "rewards/accuracies": 0.0, "rewards/chosen": 1.7903698682785034, "rewards/margins": -0.32299602031707764, "rewards/rejected": 2.113365888595581, "step": 4041 }, { "epoch": 0.89, "learning_rate": 6.088127292347268e-06, "logits/chosen": -1.9446563720703125, "logits/rejected": -1.9238983392715454, "logps/chosen": -93.22787475585938, "logps/rejected": -46.954864501953125, "loss": 0.0578, "rewards/accuracies": 1.0, "rewards/chosen": 4.9391937255859375, "rewards/margins": 2.141885995864868, "rewards/rejected": 2.7973077297210693, "step": 4042 }, { "epoch": 0.89, "learning_rate": 6.086377852278141e-06, "logits/chosen": -1.766603946685791, "logits/rejected": -1.6897672414779663, "logps/chosen": -51.72142028808594, "logps/rejected": -62.98603057861328, "loss": 0.1725, "rewards/accuracies": 1.0, "rewards/chosen": 5.360745906829834, "rewards/margins": 3.8208084106445312, "rewards/rejected": 1.5399376153945923, "step": 4043 }, { "epoch": 0.9, "learning_rate": 6.08462827261207e-06, "logits/chosen": -1.7966641187667847, "logits/rejected": -1.7490806579589844, "logps/chosen": -60.356300354003906, "logps/rejected": -11.873908996582031, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 3.769063711166382, "rewards/margins": 3.4069230556488037, "rewards/rejected": 0.3621406555175781, "step": 4044 }, { "epoch": 0.9, "learning_rate": 6.082878553573874e-06, "logits/chosen": -1.512316107749939, "logits/rejected": -1.474721074104309, "logps/chosen": -62.317527770996094, "logps/rejected": -75.95925903320312, "loss": 0.3009, "rewards/accuracies": 1.0, "rewards/chosen": 3.265923261642456, "rewards/margins": 0.736807107925415, "rewards/rejected": 2.529116153717041, "step": 4045 }, { "epoch": 0.9, "learning_rate": 6.081128695388387e-06, "logits/chosen": -1.6778509616851807, "logits/rejected": -1.6517906188964844, "logps/chosen": -72.6849136352539, "logps/rejected": -54.3458251953125, "loss": 0.7833, "rewards/accuracies": 1.0, "rewards/chosen": 4.065976619720459, "rewards/margins": 1.1729390621185303, "rewards/rejected": 2.8930375576019287, "step": 4046 }, { "epoch": 0.9, "learning_rate": 6.079378698280463e-06, "logits/chosen": -1.6496412754058838, "logits/rejected": -1.6447559595108032, "logps/chosen": -59.807289123535156, "logps/rejected": -88.54671478271484, "loss": 0.5434, "rewards/accuracies": 0.0, "rewards/chosen": 5.627335548400879, "rewards/margins": -0.6313948631286621, "rewards/rejected": 6.258730411529541, "step": 4047 }, { "epoch": 0.9, "learning_rate": 6.07762856247497e-06, "logits/chosen": -1.7985646724700928, "logits/rejected": -1.8110146522521973, "logps/chosen": -63.38577651977539, "logps/rejected": -75.62410736083984, "loss": 0.9786, "rewards/accuracies": 0.0, "rewards/chosen": 2.953679323196411, "rewards/margins": -1.8011953830718994, "rewards/rejected": 4.7548747062683105, "step": 4048 }, { "epoch": 0.9, "learning_rate": 6.075878288196797e-06, "logits/chosen": -1.8586806058883667, "logits/rejected": -1.7939881086349487, "logps/chosen": -100.29109954833984, "logps/rejected": -127.22772979736328, "loss": 1.6072, "rewards/accuracies": 0.0, "rewards/chosen": 6.730188846588135, "rewards/margins": -0.08748912811279297, "rewards/rejected": 6.817677974700928, "step": 4049 }, { "epoch": 0.9, "learning_rate": 6.07412787567085e-06, "logits/chosen": -1.8351625204086304, "logits/rejected": -1.7889881134033203, "logps/chosen": -119.33645629882812, "logps/rejected": -57.632041931152344, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 6.706584453582764, "rewards/margins": 3.4752726554870605, "rewards/rejected": 3.231311798095703, "step": 4050 }, { "epoch": 0.9, "learning_rate": 6.072377325122053e-06, "logits/chosen": -1.718837857246399, "logits/rejected": -1.7287707328796387, "logps/chosen": -56.77684020996094, "logps/rejected": -100.4741439819336, "loss": 1.4343, "rewards/accuracies": 0.0, "rewards/chosen": 2.7072417736053467, "rewards/margins": -2.576258420944214, "rewards/rejected": 5.2835001945495605, "step": 4051 }, { "epoch": 0.9, "learning_rate": 6.070626636775349e-06, "logits/chosen": -1.4638298749923706, "logits/rejected": -1.2866342067718506, "logps/chosen": -95.19506072998047, "logps/rejected": -28.222915649414062, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 7.035471439361572, "rewards/margins": 5.731360912322998, "rewards/rejected": 1.3041104078292847, "step": 4052 }, { "epoch": 0.9, "learning_rate": 6.0688758108556944e-06, "logits/chosen": -1.4676696062088013, "logits/rejected": -1.4221926927566528, "logps/chosen": -75.35569763183594, "logps/rejected": -122.00637817382812, "loss": 2.2927, "rewards/accuracies": 0.0, "rewards/chosen": 5.487184047698975, "rewards/margins": -4.544212818145752, "rewards/rejected": 10.031396865844727, "step": 4053 }, { "epoch": 0.9, "learning_rate": 6.067124847588069e-06, "logits/chosen": -1.4809104204177856, "logits/rejected": -1.5081509351730347, "logps/chosen": -193.41049194335938, "logps/rejected": -155.90292358398438, "loss": 2.9253, "rewards/accuracies": 0.0, "rewards/chosen": 7.532815456390381, "rewards/margins": -4.821378231048584, "rewards/rejected": 12.354193687438965, "step": 4054 }, { "epoch": 0.9, "learning_rate": 6.065373747197465e-06, "logits/chosen": -1.584973931312561, "logits/rejected": -1.6029572486877441, "logps/chosen": -78.80990600585938, "logps/rejected": -108.13632202148438, "loss": 2.3813, "rewards/accuracies": 0.0, "rewards/chosen": 3.077815294265747, "rewards/margins": -4.749345779418945, "rewards/rejected": 7.827160835266113, "step": 4055 }, { "epoch": 0.9, "learning_rate": 6.063622509908895e-06, "logits/chosen": -1.8885959386825562, "logits/rejected": -1.8808621168136597, "logps/chosen": -74.2792739868164, "logps/rejected": -66.69786071777344, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 5.321946620941162, "rewards/margins": 1.7388455867767334, "rewards/rejected": 3.5831010341644287, "step": 4056 }, { "epoch": 0.9, "learning_rate": 6.061871135947391e-06, "logits/chosen": -1.5778242349624634, "logits/rejected": -1.5350614786148071, "logps/chosen": -35.64306640625, "logps/rejected": -37.96934509277344, "loss": 1.2711, "rewards/accuracies": 0.0, "rewards/chosen": 2.4893906116485596, "rewards/margins": -2.246652841567993, "rewards/rejected": 4.736043453216553, "step": 4057 }, { "epoch": 0.9, "learning_rate": 6.060119625537998e-06, "logits/chosen": -1.6631815433502197, "logits/rejected": -1.7028483152389526, "logps/chosen": -41.34272384643555, "logps/rejected": -58.67629623413086, "loss": 1.6307, "rewards/accuracies": 0.0, "rewards/chosen": 1.6102246046066284, "rewards/margins": -2.7505288124084473, "rewards/rejected": 4.360753536224365, "step": 4058 }, { "epoch": 0.9, "learning_rate": 6.0583679789057814e-06, "logits/chosen": -1.5180563926696777, "logits/rejected": -1.4685308933258057, "logps/chosen": -36.02882385253906, "logps/rejected": -33.773834228515625, "loss": 0.1621, "rewards/accuracies": 1.0, "rewards/chosen": 2.449382781982422, "rewards/margins": 1.0334266424179077, "rewards/rejected": 1.4159561395645142, "step": 4059 }, { "epoch": 0.9, "learning_rate": 6.056616196275824e-06, "logits/chosen": -1.8024576902389526, "logits/rejected": -1.687221646308899, "logps/chosen": -59.60481262207031, "logps/rejected": -28.402875900268555, "loss": 0.6515, "rewards/accuracies": 1.0, "rewards/chosen": 2.970079183578491, "rewards/margins": 2.102741241455078, "rewards/rejected": 0.8673380017280579, "step": 4060 }, { "epoch": 0.9, "learning_rate": 6.054864277873225e-06, "logits/chosen": -1.297074794769287, "logits/rejected": -1.297074794769287, "logps/chosen": -37.26188278198242, "logps/rejected": -37.26188278198242, "loss": 0.68, "rewards/accuracies": 0.0, "rewards/chosen": 0.6796070337295532, "rewards/margins": 0.0, "rewards/rejected": 0.6796070337295532, "step": 4061 }, { "epoch": 0.9, "learning_rate": 6.053112223923102e-06, "logits/chosen": -1.9062966108322144, "logits/rejected": -1.890226125717163, "logps/chosen": -42.80507278442383, "logps/rejected": -32.566497802734375, "loss": 1.367, "rewards/accuracies": 1.0, "rewards/chosen": 1.735484004020691, "rewards/margins": 0.017322182655334473, "rewards/rejected": 1.7181618213653564, "step": 4062 }, { "epoch": 0.9, "learning_rate": 6.051360034650591e-06, "logits/chosen": -1.7298481464385986, "logits/rejected": -1.8039263486862183, "logps/chosen": -44.60197448730469, "logps/rejected": -147.9942169189453, "loss": 3.8451, "rewards/accuracies": 0.0, "rewards/chosen": 3.102102756500244, "rewards/margins": -6.849117755889893, "rewards/rejected": 9.951220512390137, "step": 4063 }, { "epoch": 0.9, "learning_rate": 6.049607710280841e-06, "logits/chosen": -1.5886393785476685, "logits/rejected": -1.5743194818496704, "logps/chosen": -135.22500610351562, "logps/rejected": -29.00937271118164, "loss": 0.8927, "rewards/accuracies": 1.0, "rewards/chosen": 6.1869049072265625, "rewards/margins": 1.2742090225219727, "rewards/rejected": 4.91269588470459, "step": 4064 }, { "epoch": 0.9, "learning_rate": 6.047855251039025e-06, "logits/chosen": -1.8551788330078125, "logits/rejected": -1.8176078796386719, "logps/chosen": -136.11688232421875, "logps/rejected": -55.74610137939453, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 6.150933742523193, "rewards/margins": 3.0609023571014404, "rewards/rejected": 3.090031385421753, "step": 4065 }, { "epoch": 0.9, "learning_rate": 6.046102657150328e-06, "logits/chosen": -1.6664507389068604, "logits/rejected": -1.6716269254684448, "logps/chosen": -90.89605712890625, "logps/rejected": -144.02743530273438, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 7.207342624664307, "rewards/margins": 0.5120134353637695, "rewards/rejected": 6.695329189300537, "step": 4066 }, { "epoch": 0.9, "learning_rate": 6.044349928839953e-06, "logits/chosen": -1.7520965337753296, "logits/rejected": -1.7268908023834229, "logps/chosen": -60.16173553466797, "logps/rejected": -39.98212432861328, "loss": 0.2016, "rewards/accuracies": 1.0, "rewards/chosen": 3.5513992309570312, "rewards/margins": 1.935082197189331, "rewards/rejected": 1.6163170337677002, "step": 4067 }, { "epoch": 0.9, "learning_rate": 6.042597066333124e-06, "logits/chosen": -1.4784200191497803, "logits/rejected": -1.396166205406189, "logps/chosen": -96.59349060058594, "logps/rejected": -66.29591369628906, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 6.460404872894287, "rewards/margins": 4.021605491638184, "rewards/rejected": 2.4387993812561035, "step": 4068 }, { "epoch": 0.9, "learning_rate": 6.040844069855079e-06, "logits/chosen": -1.6356842517852783, "logits/rejected": -1.6011298894882202, "logps/chosen": -54.61431121826172, "logps/rejected": -34.313941955566406, "loss": 0.4257, "rewards/accuracies": 0.0, "rewards/chosen": 1.9322227239608765, "rewards/margins": -0.09014403820037842, "rewards/rejected": 2.022366762161255, "step": 4069 }, { "epoch": 0.9, "learning_rate": 6.039090939631072e-06, "logits/chosen": -1.9758975505828857, "logits/rejected": -1.769496202468872, "logps/chosen": -87.10566711425781, "logps/rejected": -127.732666015625, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 7.046136379241943, "rewards/margins": 8.417454719543457, "rewards/rejected": -1.3713181018829346, "step": 4070 }, { "epoch": 0.9, "learning_rate": 6.037337675886379e-06, "logits/chosen": -1.8453333377838135, "logits/rejected": -1.8015177249908447, "logps/chosen": -95.8541030883789, "logps/rejected": -74.22309875488281, "loss": 0.5305, "rewards/accuracies": 0.0, "rewards/chosen": 4.899281978607178, "rewards/margins": -0.5447244644165039, "rewards/rejected": 5.444006443023682, "step": 4071 }, { "epoch": 0.9, "learning_rate": 6.035584278846286e-06, "logits/chosen": -1.7966406345367432, "logits/rejected": -1.6299700736999512, "logps/chosen": -138.48223876953125, "logps/rejected": -33.204261779785156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 7.508177280426025, "rewards/margins": 7.0677103996276855, "rewards/rejected": 0.4404667019844055, "step": 4072 }, { "epoch": 0.9, "learning_rate": 6.033830748736104e-06, "logits/chosen": -1.6971527338027954, "logits/rejected": -1.657327651977539, "logps/chosen": -37.55921936035156, "logps/rejected": -49.27420425415039, "loss": 0.7506, "rewards/accuracies": 0.0, "rewards/chosen": 2.894789218902588, "rewards/margins": -1.2462124824523926, "rewards/rejected": 4.1410017013549805, "step": 4073 }, { "epoch": 0.9, "learning_rate": 6.032077085781154e-06, "logits/chosen": -1.719603180885315, "logits/rejected": -1.7345201969146729, "logps/chosen": -99.91496276855469, "logps/rejected": -55.4867057800293, "loss": 0.2735, "rewards/accuracies": 1.0, "rewards/chosen": 3.4180374145507812, "rewards/margins": 0.3239772319793701, "rewards/rejected": 3.094060182571411, "step": 4074 }, { "epoch": 0.9, "learning_rate": 6.030323290206781e-06, "logits/chosen": -1.301888108253479, "logits/rejected": -1.262473225593567, "logps/chosen": -48.683074951171875, "logps/rejected": -59.078495025634766, "loss": 0.7188, "rewards/accuracies": 1.0, "rewards/chosen": 4.691159725189209, "rewards/margins": 1.9631319046020508, "rewards/rejected": 2.728027820587158, "step": 4075 }, { "epoch": 0.9, "learning_rate": 6.028569362238342e-06, "logits/chosen": -1.8196779489517212, "logits/rejected": -1.7519713640213013, "logps/chosen": -40.65840148925781, "logps/rejected": -18.991539001464844, "loss": 0.1452, "rewards/accuracies": 1.0, "rewards/chosen": 2.711897373199463, "rewards/margins": 1.1449509859085083, "rewards/rejected": 1.5669463872909546, "step": 4076 }, { "epoch": 0.9, "learning_rate": 6.026815302101212e-06, "logits/chosen": -1.7216401100158691, "logits/rejected": -1.720634937286377, "logps/chosen": -48.228084564208984, "logps/rejected": -37.631996154785156, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": 5.25879430770874, "rewards/margins": 1.727113962173462, "rewards/rejected": 3.5316803455352783, "step": 4077 }, { "epoch": 0.9, "learning_rate": 6.025061110020784e-06, "logits/chosen": -1.7124156951904297, "logits/rejected": -1.6317236423492432, "logps/chosen": -87.78657531738281, "logps/rejected": -59.53028869628906, "loss": 0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 6.116620063781738, "rewards/margins": 3.843881368637085, "rewards/rejected": 2.2727386951446533, "step": 4078 }, { "epoch": 0.9, "learning_rate": 6.023306786222469e-06, "logits/chosen": -1.843370795249939, "logits/rejected": -1.813869833946228, "logps/chosen": -41.327728271484375, "logps/rejected": -60.362030029296875, "loss": 0.7834, "rewards/accuracies": 0.0, "rewards/chosen": 3.5393340587615967, "rewards/margins": -0.3080735206604004, "rewards/rejected": 3.847407579421997, "step": 4079 }, { "epoch": 0.9, "learning_rate": 6.021552330931693e-06, "logits/chosen": -1.5370763540267944, "logits/rejected": -1.5370800495147705, "logps/chosen": -50.10661697387695, "logps/rejected": -66.8198013305664, "loss": 1.0357, "rewards/accuracies": 0.0, "rewards/chosen": 2.9824140071868896, "rewards/margins": -1.7365968227386475, "rewards/rejected": 4.719010829925537, "step": 4080 }, { "epoch": 0.9, "learning_rate": 6.019797744373896e-06, "logits/chosen": -1.5093449354171753, "logits/rejected": -1.4148589372634888, "logps/chosen": -71.61753845214844, "logps/rejected": -52.0778923034668, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 3.778170108795166, "rewards/margins": 1.2676167488098145, "rewards/rejected": 2.5105533599853516, "step": 4081 }, { "epoch": 0.9, "learning_rate": 6.01804302677454e-06, "logits/chosen": -1.5362788438796997, "logits/rejected": -1.5277637243270874, "logps/chosen": -64.61290740966797, "logps/rejected": -44.12192916870117, "loss": 0.3912, "rewards/accuracies": 0.0, "rewards/chosen": 3.018496036529541, "rewards/margins": -0.006392955780029297, "rewards/rejected": 3.0248889923095703, "step": 4082 }, { "epoch": 0.9, "learning_rate": 6.0162881783591034e-06, "logits/chosen": -1.9577242136001587, "logits/rejected": -1.9010173082351685, "logps/chosen": -81.92594146728516, "logps/rejected": -102.71815490722656, "loss": 1.3758, "rewards/accuracies": 0.0, "rewards/chosen": 5.8228936195373535, "rewards/margins": -2.4857382774353027, "rewards/rejected": 8.308631896972656, "step": 4083 }, { "epoch": 0.9, "learning_rate": 6.0145331993530785e-06, "logits/chosen": -1.9053095579147339, "logits/rejected": -1.812605857849121, "logps/chosen": -121.70801544189453, "logps/rejected": -39.947208404541016, "loss": 0.4125, "rewards/accuracies": 1.0, "rewards/chosen": 3.934157609939575, "rewards/margins": 3.06548810005188, "rewards/rejected": 0.8686695098876953, "step": 4084 }, { "epoch": 0.9, "learning_rate": 6.012778089981976e-06, "logits/chosen": -1.8507195711135864, "logits/rejected": -1.769337773323059, "logps/chosen": -109.86788940429688, "logps/rejected": -17.42832374572754, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 6.531292915344238, "rewards/margins": 6.137264251708984, "rewards/rejected": 0.3940288722515106, "step": 4085 }, { "epoch": 0.9, "learning_rate": 6.011022850471324e-06, "logits/chosen": -1.6119110584259033, "logits/rejected": -1.5469738245010376, "logps/chosen": -58.01182556152344, "logps/rejected": -20.12226104736328, "loss": 0.7533, "rewards/accuracies": 0.0, "rewards/chosen": 1.5334968566894531, "rewards/margins": -0.391953706741333, "rewards/rejected": 1.9254505634307861, "step": 4086 }, { "epoch": 0.9, "learning_rate": 6.009267481046667e-06, "logits/chosen": -1.516900658607483, "logits/rejected": -1.5233837366104126, "logps/chosen": -58.469482421875, "logps/rejected": -62.90354537963867, "loss": 1.5215, "rewards/accuracies": 0.0, "rewards/chosen": 2.4263665676116943, "rewards/margins": -0.5564043521881104, "rewards/rejected": 2.9827709197998047, "step": 4087 }, { "epoch": 0.9, "learning_rate": 6.007511981933564e-06, "logits/chosen": -1.6757348775863647, "logits/rejected": -1.7019007205963135, "logps/chosen": -78.2496566772461, "logps/rejected": -49.89582061767578, "loss": 0.9603, "rewards/accuracies": 0.0, "rewards/chosen": 2.317021131515503, "rewards/margins": -1.7474777698516846, "rewards/rejected": 4.0644989013671875, "step": 4088 }, { "epoch": 0.91, "learning_rate": 6.005756353357595e-06, "logits/chosen": -2.0036542415618896, "logits/rejected": -2.029759168624878, "logps/chosen": -29.117023468017578, "logps/rejected": -51.552730560302734, "loss": 2.2051, "rewards/accuracies": 0.0, "rewards/chosen": 2.7315940856933594, "rewards/margins": -2.8135008811950684, "rewards/rejected": 5.545094966888428, "step": 4089 }, { "epoch": 0.91, "learning_rate": 6.00400059554435e-06, "logits/chosen": -1.6302430629730225, "logits/rejected": -1.6317154169082642, "logps/chosen": -31.0781307220459, "logps/rejected": -54.0816650390625, "loss": 0.8329, "rewards/accuracies": 0.0, "rewards/chosen": 1.410841941833496, "rewards/margins": -1.302201271057129, "rewards/rejected": 2.713043212890625, "step": 4090 }, { "epoch": 0.91, "learning_rate": 6.0022447087194425e-06, "logits/chosen": -1.2647147178649902, "logits/rejected": -1.2622637748718262, "logps/chosen": -63.57293701171875, "logps/rejected": -48.10926818847656, "loss": 0.2363, "rewards/accuracies": 1.0, "rewards/chosen": 4.881043910980225, "rewards/margins": 0.5599761009216309, "rewards/rejected": 4.321067810058594, "step": 4091 }, { "epoch": 0.91, "learning_rate": 6.0004886931085e-06, "logits/chosen": -1.5439646244049072, "logits/rejected": -1.6809210777282715, "logps/chosen": -40.32887649536133, "logps/rejected": -126.70800018310547, "loss": 2.8181, "rewards/accuracies": 0.0, "rewards/chosen": 1.4485607147216797, "rewards/margins": -4.448171615600586, "rewards/rejected": 5.896732330322266, "step": 4092 }, { "epoch": 0.91, "learning_rate": 5.998732548937164e-06, "logits/chosen": -1.5416631698608398, "logits/rejected": -1.5368101596832275, "logps/chosen": -53.86893844604492, "logps/rejected": -69.71836853027344, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": 5.215351581573486, "rewards/margins": 2.6404507160186768, "rewards/rejected": 2.5749008655548096, "step": 4093 }, { "epoch": 0.91, "learning_rate": 5.996976276431097e-06, "logits/chosen": -1.761115550994873, "logits/rejected": -1.7848623991012573, "logps/chosen": -44.472225189208984, "logps/rejected": -62.237892150878906, "loss": 0.6344, "rewards/accuracies": 0.0, "rewards/chosen": 2.206017017364502, "rewards/margins": -0.7926173210144043, "rewards/rejected": 2.9986343383789062, "step": 4094 }, { "epoch": 0.91, "learning_rate": 5.9952198758159765e-06, "logits/chosen": -1.8046584129333496, "logits/rejected": -1.790596842765808, "logps/chosen": -28.59644317626953, "logps/rejected": -11.146697998046875, "loss": 0.3621, "rewards/accuracies": 1.0, "rewards/chosen": 3.72517466545105, "rewards/margins": 3.0334291458129883, "rewards/rejected": 0.691745400428772, "step": 4095 }, { "epoch": 0.91, "learning_rate": 5.993463347317494e-06, "logits/chosen": -1.5987862348556519, "logits/rejected": -1.6421492099761963, "logps/chosen": -56.5418815612793, "logps/rejected": -96.57820129394531, "loss": 0.5279, "rewards/accuracies": 0.0, "rewards/chosen": 3.1419055461883545, "rewards/margins": -0.5695950984954834, "rewards/rejected": 3.711500644683838, "step": 4096 }, { "epoch": 0.91, "learning_rate": 5.9917066911613585e-06, "logits/chosen": -1.4041446447372437, "logits/rejected": -1.4006563425064087, "logps/chosen": -17.28386116027832, "logps/rejected": -39.344886779785156, "loss": 0.8264, "rewards/accuracies": 1.0, "rewards/chosen": 1.2663332223892212, "rewards/margins": 0.6063825488090515, "rewards/rejected": 0.6599506735801697, "step": 4097 }, { "epoch": 0.91, "learning_rate": 5.989949907573298e-06, "logits/chosen": -1.839721918106079, "logits/rejected": -1.8879622220993042, "logps/chosen": -52.59440612792969, "logps/rejected": -121.27681732177734, "loss": 4.2363, "rewards/accuracies": 0.0, "rewards/chosen": 2.1114823818206787, "rewards/margins": -7.779003143310547, "rewards/rejected": 9.890485763549805, "step": 4098 }, { "epoch": 0.91, "learning_rate": 5.9881929967790544e-06, "logits/chosen": -1.8257787227630615, "logits/rejected": -1.8440698385238647, "logps/chosen": -110.17452239990234, "logps/rejected": -121.80741119384766, "loss": 0.8856, "rewards/accuracies": 1.0, "rewards/chosen": 6.2761759757995605, "rewards/margins": 0.8926959037780762, "rewards/rejected": 5.383480072021484, "step": 4099 }, { "epoch": 0.91, "learning_rate": 5.986435959004386e-06, "logits/chosen": -1.5086740255355835, "logits/rejected": -1.463637351989746, "logps/chosen": -49.549095153808594, "logps/rejected": -38.585384368896484, "loss": 0.8433, "rewards/accuracies": 0.0, "rewards/chosen": 3.33955454826355, "rewards/margins": -0.5310559272766113, "rewards/rejected": 3.870610475540161, "step": 4100 }, { "epoch": 0.91, "learning_rate": 5.984678794475067e-06, "logits/chosen": -1.8736625909805298, "logits/rejected": -1.8736625909805298, "logps/chosen": -62.9093017578125, "logps/rejected": -62.9093017578125, "loss": 1.2046, "rewards/accuracies": 0.0, "rewards/chosen": 2.598886251449585, "rewards/margins": 0.0, "rewards/rejected": 2.598886251449585, "step": 4101 }, { "epoch": 0.91, "learning_rate": 5.982921503416891e-06, "logits/chosen": -1.9747880697250366, "logits/rejected": -1.9507883787155151, "logps/chosen": -52.70958709716797, "logps/rejected": -49.729034423828125, "loss": 2.0763, "rewards/accuracies": 0.0, "rewards/chosen": 2.6399848461151123, "rewards/margins": -3.2767817974090576, "rewards/rejected": 5.91676664352417, "step": 4102 }, { "epoch": 0.91, "learning_rate": 5.9811640860556644e-06, "logits/chosen": -1.7008811235427856, "logits/rejected": -1.7374188899993896, "logps/chosen": -47.271427154541016, "logps/rejected": -74.15464782714844, "loss": 1.48, "rewards/accuracies": 0.0, "rewards/chosen": 3.502342700958252, "rewards/margins": -2.702474594116211, "rewards/rejected": 6.204817295074463, "step": 4103 }, { "epoch": 0.91, "learning_rate": 5.979406542617212e-06, "logits/chosen": -1.556319236755371, "logits/rejected": -1.556319236755371, "logps/chosen": -50.937679290771484, "logps/rejected": -50.937679290771484, "loss": 0.6596, "rewards/accuracies": 0.0, "rewards/chosen": 3.3808581829071045, "rewards/margins": 0.0, "rewards/rejected": 3.3808581829071045, "step": 4104 }, { "epoch": 0.91, "learning_rate": 5.977648873327374e-06, "logits/chosen": -1.1511380672454834, "logits/rejected": -1.1960219144821167, "logps/chosen": -77.05996704101562, "logps/rejected": -45.091461181640625, "loss": 0.338, "rewards/accuracies": 1.0, "rewards/chosen": 3.532576084136963, "rewards/margins": 0.05178380012512207, "rewards/rejected": 3.480792284011841, "step": 4105 }, { "epoch": 0.91, "learning_rate": 5.975891078412004e-06, "logits/chosen": -1.854565143585205, "logits/rejected": -1.765631914138794, "logps/chosen": -165.0299530029297, "logps/rejected": -34.23893737792969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 10.392799377441406, "rewards/margins": 7.701417922973633, "rewards/rejected": 2.6913812160491943, "step": 4106 }, { "epoch": 0.91, "learning_rate": 5.974133158096977e-06, "logits/chosen": -1.6805528402328491, "logits/rejected": -1.6223676204681396, "logps/chosen": -31.706249237060547, "logps/rejected": -53.24574279785156, "loss": 0.4492, "rewards/accuracies": 1.0, "rewards/chosen": 2.892296314239502, "rewards/margins": 0.46775031089782715, "rewards/rejected": 2.424546003341675, "step": 4107 }, { "epoch": 0.91, "learning_rate": 5.972375112608182e-06, "logits/chosen": -1.6497771739959717, "logits/rejected": -1.6474982500076294, "logps/chosen": -39.2560920715332, "logps/rejected": -42.35792922973633, "loss": 1.6661, "rewards/accuracies": 0.0, "rewards/chosen": 1.954264521598816, "rewards/margins": -0.3561760187149048, "rewards/rejected": 2.3104405403137207, "step": 4108 }, { "epoch": 0.91, "learning_rate": 5.970616942171522e-06, "logits/chosen": -1.9347718954086304, "logits/rejected": -1.9204868078231812, "logps/chosen": -65.07916259765625, "logps/rejected": -35.81557083129883, "loss": 0.6174, "rewards/accuracies": 0.0, "rewards/chosen": 3.185345411300659, "rewards/margins": -0.27854061126708984, "rewards/rejected": 3.463886022567749, "step": 4109 }, { "epoch": 0.91, "learning_rate": 5.968858647012918e-06, "logits/chosen": -1.8883546590805054, "logits/rejected": -1.8882189989089966, "logps/chosen": -45.334686279296875, "logps/rejected": -81.95759582519531, "loss": 1.7426, "rewards/accuracies": 0.0, "rewards/chosen": 2.183739423751831, "rewards/margins": -1.933469533920288, "rewards/rejected": 4.117208957672119, "step": 4110 }, { "epoch": 0.91, "learning_rate": 5.967100227358307e-06, "logits/chosen": -1.732406497001648, "logits/rejected": -1.7366065979003906, "logps/chosen": -39.495384216308594, "logps/rejected": -33.16539764404297, "loss": 0.2938, "rewards/accuracies": 1.0, "rewards/chosen": 1.7277313470840454, "rewards/margins": 0.30582118034362793, "rewards/rejected": 1.4219101667404175, "step": 4111 }, { "epoch": 0.91, "learning_rate": 5.965341683433642e-06, "logits/chosen": -1.7647898197174072, "logits/rejected": -1.645523190498352, "logps/chosen": -101.3215103149414, "logps/rejected": -105.29792785644531, "loss": 1.8515, "rewards/accuracies": 0.0, "rewards/chosen": 5.762053966522217, "rewards/margins": -2.9109368324279785, "rewards/rejected": 8.672990798950195, "step": 4112 }, { "epoch": 0.91, "learning_rate": 5.963583015464891e-06, "logits/chosen": -1.4366328716278076, "logits/rejected": -1.3568599224090576, "logps/chosen": -40.04207992553711, "logps/rejected": -15.250020980834961, "loss": 0.5646, "rewards/accuracies": 1.0, "rewards/chosen": 2.306593418121338, "rewards/margins": 1.7362051010131836, "rewards/rejected": 0.5703882575035095, "step": 4113 }, { "epoch": 0.91, "learning_rate": 5.9618242236780386e-06, "logits/chosen": -1.559651255607605, "logits/rejected": -1.4650776386260986, "logps/chosen": -43.961753845214844, "logps/rejected": -42.395240783691406, "loss": 1.8992, "rewards/accuracies": 1.0, "rewards/chosen": 2.621288299560547, "rewards/margins": 0.5541102886199951, "rewards/rejected": 2.0671780109405518, "step": 4114 }, { "epoch": 0.91, "learning_rate": 5.9600653082990855e-06, "logits/chosen": -1.6937364339828491, "logits/rejected": -1.634405255317688, "logps/chosen": -46.96461868286133, "logps/rejected": -71.09303283691406, "loss": 0.7634, "rewards/accuracies": 0.0, "rewards/chosen": 5.202315807342529, "rewards/margins": -0.059957027435302734, "rewards/rejected": 5.262272834777832, "step": 4115 }, { "epoch": 0.91, "learning_rate": 5.9583062695540484e-06, "logits/chosen": -1.9075862169265747, "logits/rejected": -1.9310563802719116, "logps/chosen": -52.124298095703125, "logps/rejected": -54.61516571044922, "loss": 1.3194, "rewards/accuracies": 0.0, "rewards/chosen": 3.067028760910034, "rewards/margins": -1.7913568019866943, "rewards/rejected": 4.8583855628967285, "step": 4116 }, { "epoch": 0.91, "learning_rate": 5.95654710766896e-06, "logits/chosen": -1.543611764907837, "logits/rejected": -1.4958428144454956, "logps/chosen": -191.39132690429688, "logps/rejected": -149.72064208984375, "loss": 0.6223, "rewards/accuracies": 1.0, "rewards/chosen": 6.923248291015625, "rewards/margins": 1.2354307174682617, "rewards/rejected": 5.687817573547363, "step": 4117 }, { "epoch": 0.91, "learning_rate": 5.9547878228698675e-06, "logits/chosen": -1.508156180381775, "logits/rejected": -1.508156180381775, "logps/chosen": -48.03907775878906, "logps/rejected": -48.03907775878906, "loss": 0.8111, "rewards/accuracies": 0.0, "rewards/chosen": 3.531750440597534, "rewards/margins": 0.0, "rewards/rejected": 3.531750440597534, "step": 4118 }, { "epoch": 0.91, "learning_rate": 5.953028415382836e-06, "logits/chosen": -1.7614378929138184, "logits/rejected": -1.6724436283111572, "logps/chosen": -117.66121673583984, "logps/rejected": -69.3796615600586, "loss": 0.5801, "rewards/accuracies": 0.0, "rewards/chosen": 4.2515788078308105, "rewards/margins": -0.6743588447570801, "rewards/rejected": 4.925937652587891, "step": 4119 }, { "epoch": 0.91, "learning_rate": 5.9512688854339415e-06, "logits/chosen": -2.0151307582855225, "logits/rejected": -1.9985533952713013, "logps/chosen": -102.10151672363281, "logps/rejected": -110.87077331542969, "loss": 0.4036, "rewards/accuracies": 1.0, "rewards/chosen": 8.532889366149902, "rewards/margins": 1.1380176544189453, "rewards/rejected": 7.394871711730957, "step": 4120 }, { "epoch": 0.91, "learning_rate": 5.949509233249283e-06, "logits/chosen": -1.645658016204834, "logits/rejected": -1.6518255472183228, "logps/chosen": -95.97900390625, "logps/rejected": -88.602294921875, "loss": 2.2028, "rewards/accuracies": 0.0, "rewards/chosen": 2.458247423171997, "rewards/margins": -3.478585958480835, "rewards/rejected": 5.936833381652832, "step": 4121 }, { "epoch": 0.91, "learning_rate": 5.947749459054972e-06, "logits/chosen": -1.714937686920166, "logits/rejected": -1.6382075548171997, "logps/chosen": -84.89118957519531, "logps/rejected": -58.63191604614258, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 4.898913860321045, "rewards/margins": 2.31186580657959, "rewards/rejected": 2.587048053741455, "step": 4122 }, { "epoch": 0.91, "learning_rate": 5.945989563077133e-06, "logits/chosen": -1.5995804071426392, "logits/rejected": -1.5230693817138672, "logps/chosen": -44.9838752746582, "logps/rejected": -15.622151374816895, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 2.216125965118408, "rewards/margins": 1.7389600276947021, "rewards/rejected": 0.47716590762138367, "step": 4123 }, { "epoch": 0.91, "learning_rate": 5.9442295455419085e-06, "logits/chosen": -1.4661812782287598, "logits/rejected": -1.4153810739517212, "logps/chosen": -95.06586456298828, "logps/rejected": -123.53425598144531, "loss": 0.7187, "rewards/accuracies": 0.0, "rewards/chosen": 4.642147064208984, "rewards/margins": -1.148679256439209, "rewards/rejected": 5.790826320648193, "step": 4124 }, { "epoch": 0.91, "learning_rate": 5.9424694066754584e-06, "logits/chosen": -1.4384987354278564, "logits/rejected": -1.3410978317260742, "logps/chosen": -36.778404235839844, "logps/rejected": -14.719386100769043, "loss": 0.8694, "rewards/accuracies": 1.0, "rewards/chosen": 1.2128639221191406, "rewards/margins": 0.0077697038650512695, "rewards/rejected": 1.2050942182540894, "step": 4125 }, { "epoch": 0.91, "learning_rate": 5.940709146703954e-06, "logits/chosen": -1.4521918296813965, "logits/rejected": -1.385899543762207, "logps/chosen": -38.44591522216797, "logps/rejected": -70.09008026123047, "loss": 0.612, "rewards/accuracies": 0.0, "rewards/chosen": 2.3215408325195312, "rewards/margins": -0.8214919567108154, "rewards/rejected": 3.1430327892303467, "step": 4126 }, { "epoch": 0.91, "learning_rate": 5.938948765853586e-06, "logits/chosen": -1.8292533159255981, "logits/rejected": -1.8312519788742065, "logps/chosen": -94.18709564208984, "logps/rejected": -28.618017196655273, "loss": 0.1741, "rewards/accuracies": 1.0, "rewards/chosen": 5.490519046783447, "rewards/margins": 3.632676601409912, "rewards/rejected": 1.8578424453735352, "step": 4127 }, { "epoch": 0.91, "learning_rate": 5.937188264350559e-06, "logits/chosen": -1.8980292081832886, "logits/rejected": -1.86538827419281, "logps/chosen": -49.65530776977539, "logps/rejected": -25.053743362426758, "loss": 0.1797, "rewards/accuracies": 1.0, "rewards/chosen": 2.8472485542297363, "rewards/margins": 2.110595941543579, "rewards/rejected": 0.7366525530815125, "step": 4128 }, { "epoch": 0.91, "learning_rate": 5.935427642421094e-06, "logits/chosen": -1.7036112546920776, "logits/rejected": -1.6842979192733765, "logps/chosen": -61.138092041015625, "logps/rejected": -69.92616271972656, "loss": 1.6608, "rewards/accuracies": 0.0, "rewards/chosen": 2.1052169799804688, "rewards/margins": -1.8250153064727783, "rewards/rejected": 3.930232286453247, "step": 4129 }, { "epoch": 0.91, "learning_rate": 5.9336669002914245e-06, "logits/chosen": -1.4858198165893555, "logits/rejected": -1.3883850574493408, "logps/chosen": -45.349998474121094, "logps/rejected": -34.618927001953125, "loss": 0.0964, "rewards/accuracies": 1.0, "rewards/chosen": 4.853890419006348, "rewards/margins": 1.6254639625549316, "rewards/rejected": 3.228426456451416, "step": 4130 }, { "epoch": 0.91, "learning_rate": 5.931906038187805e-06, "logits/chosen": -1.758754014968872, "logits/rejected": -1.6769049167633057, "logps/chosen": -92.2669677734375, "logps/rejected": -81.970947265625, "loss": 0.8575, "rewards/accuracies": 0.0, "rewards/chosen": 3.954205274581909, "rewards/margins": -1.4649078845977783, "rewards/rejected": 5.4191131591796875, "step": 4131 }, { "epoch": 0.91, "learning_rate": 5.930145056336497e-06, "logits/chosen": -1.5886456966400146, "logits/rejected": -1.5886456966400146, "logps/chosen": -83.0107192993164, "logps/rejected": -83.0107192993164, "loss": 0.4099, "rewards/accuracies": 0.0, "rewards/chosen": 3.6105141639709473, "rewards/margins": 0.0, "rewards/rejected": 3.6105141639709473, "step": 4132 }, { "epoch": 0.91, "learning_rate": 5.928383954963785e-06, "logits/chosen": -1.6471340656280518, "logits/rejected": -1.6497573852539062, "logps/chosen": -33.861392974853516, "logps/rejected": -46.579925537109375, "loss": 0.5427, "rewards/accuracies": 0.0, "rewards/chosen": 4.350992202758789, "rewards/margins": -0.6433110237121582, "rewards/rejected": 4.994303226470947, "step": 4133 }, { "epoch": 0.92, "learning_rate": 5.926622734295968e-06, "logits/chosen": -1.4350368976593018, "logits/rejected": -1.372752070426941, "logps/chosen": -56.26327896118164, "logps/rejected": -59.505210876464844, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": 1.9787991046905518, "rewards/margins": 1.8383407592773438, "rewards/rejected": 0.14045830070972443, "step": 4134 }, { "epoch": 0.92, "learning_rate": 5.924861394559357e-06, "logits/chosen": -1.3969367742538452, "logits/rejected": -1.3923479318618774, "logps/chosen": -39.776283264160156, "logps/rejected": -52.319618225097656, "loss": 0.8363, "rewards/accuracies": 0.0, "rewards/chosen": 3.5261406898498535, "rewards/margins": -0.775324821472168, "rewards/rejected": 4.3014655113220215, "step": 4135 }, { "epoch": 0.92, "learning_rate": 5.923099935980278e-06, "logits/chosen": -1.494859218597412, "logits/rejected": -1.4622305631637573, "logps/chosen": -94.9415283203125, "logps/rejected": -74.66261291503906, "loss": 0.6587, "rewards/accuracies": 0.0, "rewards/chosen": 5.204811096191406, "rewards/margins": -0.9759612083435059, "rewards/rejected": 6.180772304534912, "step": 4136 }, { "epoch": 0.92, "learning_rate": 5.921338358785079e-06, "logits/chosen": -1.4816581010818481, "logits/rejected": -1.4667532444000244, "logps/chosen": -46.242950439453125, "logps/rejected": -62.110965728759766, "loss": 0.7857, "rewards/accuracies": 0.0, "rewards/chosen": 1.9233620166778564, "rewards/margins": -1.195718765258789, "rewards/rejected": 3.1190807819366455, "step": 4137 }, { "epoch": 0.92, "learning_rate": 5.919576663200116e-06, "logits/chosen": -1.8611657619476318, "logits/rejected": -1.7612155675888062, "logps/chosen": -50.3342170715332, "logps/rejected": -22.577123641967773, "loss": 0.2436, "rewards/accuracies": 1.0, "rewards/chosen": 2.090759038925171, "rewards/margins": 0.676101565361023, "rewards/rejected": 1.414657473564148, "step": 4138 }, { "epoch": 0.92, "learning_rate": 5.917814849451762e-06, "logits/chosen": -1.5806330442428589, "logits/rejected": -1.5683979988098145, "logps/chosen": -58.50381851196289, "logps/rejected": -73.28813171386719, "loss": 1.368, "rewards/accuracies": 0.0, "rewards/chosen": 2.388578414916992, "rewards/margins": -1.5952298641204834, "rewards/rejected": 3.9838082790374756, "step": 4139 }, { "epoch": 0.92, "learning_rate": 5.916052917766406e-06, "logits/chosen": -1.7694411277770996, "logits/rejected": -1.7459957599639893, "logps/chosen": -51.9293098449707, "logps/rejected": -49.11316680908203, "loss": 1.3902, "rewards/accuracies": 0.0, "rewards/chosen": 2.3514554500579834, "rewards/margins": -1.954723596572876, "rewards/rejected": 4.306179046630859, "step": 4140 }, { "epoch": 0.92, "learning_rate": 5.914290868370451e-06, "logits/chosen": -1.9454355239868164, "logits/rejected": -1.941888451576233, "logps/chosen": -44.42519760131836, "logps/rejected": -55.81144714355469, "loss": 0.8437, "rewards/accuracies": 0.0, "rewards/chosen": 3.306506872177124, "rewards/margins": -0.507209300994873, "rewards/rejected": 3.813716173171997, "step": 4141 }, { "epoch": 0.92, "learning_rate": 5.912528701490317e-06, "logits/chosen": -1.844692587852478, "logits/rejected": -1.7694743871688843, "logps/chosen": -78.50070190429688, "logps/rejected": -61.6373291015625, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 3.565800428390503, "rewards/margins": 2.250694990158081, "rewards/rejected": 1.3151054382324219, "step": 4142 }, { "epoch": 0.92, "learning_rate": 5.91076641735244e-06, "logits/chosen": -1.3857297897338867, "logits/rejected": -1.3723537921905518, "logps/chosen": -109.10518646240234, "logps/rejected": -76.6246337890625, "loss": 0.1834, "rewards/accuracies": 1.0, "rewards/chosen": 4.558431148529053, "rewards/margins": 1.6443898677825928, "rewards/rejected": 2.91404128074646, "step": 4143 }, { "epoch": 0.92, "learning_rate": 5.909004016183267e-06, "logits/chosen": -1.640367865562439, "logits/rejected": -1.585172414779663, "logps/chosen": -41.98590087890625, "logps/rejected": -26.55482292175293, "loss": 0.1907, "rewards/accuracies": 1.0, "rewards/chosen": 2.337416887283325, "rewards/margins": 1.422835350036621, "rewards/rejected": 0.9145814776420593, "step": 4144 }, { "epoch": 0.92, "learning_rate": 5.907241498209264e-06, "logits/chosen": -1.7175198793411255, "logits/rejected": -1.686683177947998, "logps/chosen": -91.1650161743164, "logps/rejected": -61.91529846191406, "loss": 0.3727, "rewards/accuracies": 1.0, "rewards/chosen": 4.824051856994629, "rewards/margins": 0.6581201553344727, "rewards/rejected": 4.165931701660156, "step": 4145 }, { "epoch": 0.92, "learning_rate": 5.905478863656909e-06, "logits/chosen": -1.868255376815796, "logits/rejected": -1.8769806623458862, "logps/chosen": -31.489044189453125, "logps/rejected": -74.79582214355469, "loss": 0.9069, "rewards/accuracies": 0.0, "rewards/chosen": 3.2780539989471436, "rewards/margins": -1.5922386646270752, "rewards/rejected": 4.870292663574219, "step": 4146 }, { "epoch": 0.92, "learning_rate": 5.903716112752696e-06, "logits/chosen": -1.6730539798736572, "logits/rejected": -1.6708509922027588, "logps/chosen": -54.99408721923828, "logps/rejected": -51.64652633666992, "loss": 0.309, "rewards/accuracies": 1.0, "rewards/chosen": 3.684532880783081, "rewards/margins": 0.30040454864501953, "rewards/rejected": 3.3841283321380615, "step": 4147 }, { "epoch": 0.92, "learning_rate": 5.901953245723137e-06, "logits/chosen": -1.4688811302185059, "logits/rejected": -1.4688811302185059, "logps/chosen": -7.395987510681152, "logps/rejected": -7.395987510681152, "loss": 0.3494, "rewards/accuracies": 0.0, "rewards/chosen": 1.0591243505477905, "rewards/margins": 0.0, "rewards/rejected": 1.0591243505477905, "step": 4148 }, { "epoch": 0.92, "learning_rate": 5.900190262794752e-06, "logits/chosen": -1.8881806135177612, "logits/rejected": -1.909039855003357, "logps/chosen": -82.98197937011719, "logps/rejected": -55.39186096191406, "loss": 0.1654, "rewards/accuracies": 1.0, "rewards/chosen": 5.633963108062744, "rewards/margins": 1.2582182884216309, "rewards/rejected": 4.375744819641113, "step": 4149 }, { "epoch": 0.92, "learning_rate": 5.898427164194084e-06, "logits/chosen": -1.4126527309417725, "logits/rejected": -1.377252221107483, "logps/chosen": -32.54536819458008, "logps/rejected": -28.103713989257812, "loss": 0.2833, "rewards/accuracies": 1.0, "rewards/chosen": 1.778899073600769, "rewards/margins": 0.5701375007629395, "rewards/rejected": 1.2087615728378296, "step": 4150 }, { "epoch": 0.92, "learning_rate": 5.896663950147684e-06, "logits/chosen": -1.5882962942123413, "logits/rejected": -1.5870198011398315, "logps/chosen": -58.21453857421875, "logps/rejected": -52.00872039794922, "loss": 1.193, "rewards/accuracies": 1.0, "rewards/chosen": 4.935965061187744, "rewards/margins": 1.4049019813537598, "rewards/rejected": 3.5310630798339844, "step": 4151 }, { "epoch": 0.92, "learning_rate": 5.8949006208821215e-06, "logits/chosen": -1.694698691368103, "logits/rejected": -1.5486739873886108, "logps/chosen": -75.59902954101562, "logps/rejected": -5.432713031768799, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 3.393786668777466, "rewards/margins": 2.3737292289733887, "rewards/rejected": 1.0200575590133667, "step": 4152 }, { "epoch": 0.92, "learning_rate": 5.893137176623981e-06, "logits/chosen": -1.5721241235733032, "logits/rejected": -1.5413066148757935, "logps/chosen": -104.73782348632812, "logps/rejected": -79.01246643066406, "loss": 1.2675, "rewards/accuracies": 0.0, "rewards/chosen": 5.6563873291015625, "rewards/margins": -1.3861174583435059, "rewards/rejected": 7.042504787445068, "step": 4153 }, { "epoch": 0.92, "learning_rate": 5.891373617599861e-06, "logits/chosen": -1.6820924282073975, "logits/rejected": -1.6247310638427734, "logps/chosen": -146.8184356689453, "logps/rejected": -127.03720092773438, "loss": 0.184, "rewards/accuracies": 1.0, "rewards/chosen": 7.806654453277588, "rewards/margins": 0.8577132225036621, "rewards/rejected": 6.948941230773926, "step": 4154 }, { "epoch": 0.92, "learning_rate": 5.889609944036373e-06, "logits/chosen": -1.7635562419891357, "logits/rejected": -1.6609692573547363, "logps/chosen": -80.4281234741211, "logps/rejected": -56.67249298095703, "loss": 0.8046, "rewards/accuracies": 1.0, "rewards/chosen": 6.5756449699401855, "rewards/margins": 1.380781650543213, "rewards/rejected": 5.194863319396973, "step": 4155 }, { "epoch": 0.92, "learning_rate": 5.887846156160147e-06, "logits/chosen": -1.501477599143982, "logits/rejected": -1.5526307821273804, "logps/chosen": -49.329345703125, "logps/rejected": -68.02027130126953, "loss": 1.6323, "rewards/accuracies": 0.0, "rewards/chosen": 1.5697624683380127, "rewards/margins": -2.0958266258239746, "rewards/rejected": 3.6655890941619873, "step": 4156 }, { "epoch": 0.92, "learning_rate": 5.8860822541978225e-06, "logits/chosen": -1.7696160078048706, "logits/rejected": -1.7467784881591797, "logps/chosen": -54.070228576660156, "logps/rejected": -47.02802658081055, "loss": 0.655, "rewards/accuracies": 0.0, "rewards/chosen": 1.8617485761642456, "rewards/margins": -0.9928134679794312, "rewards/rejected": 2.8545620441436768, "step": 4157 }, { "epoch": 0.92, "learning_rate": 5.884318238376059e-06, "logits/chosen": -1.6060467958450317, "logits/rejected": -1.0839158296585083, "logps/chosen": -40.70450210571289, "logps/rejected": -109.88015747070312, "loss": 1.3296, "rewards/accuracies": 0.0, "rewards/chosen": 2.2424397468566895, "rewards/margins": -2.0983762741088867, "rewards/rejected": 4.340816020965576, "step": 4158 }, { "epoch": 0.92, "learning_rate": 5.882554108921528e-06, "logits/chosen": -1.6278811693191528, "logits/rejected": -1.6018896102905273, "logps/chosen": -97.96315002441406, "logps/rejected": -112.65762329101562, "loss": 1.0655, "rewards/accuracies": 0.0, "rewards/chosen": 4.012368679046631, "rewards/margins": -1.8218460083007812, "rewards/rejected": 5.834214687347412, "step": 4159 }, { "epoch": 0.92, "learning_rate": 5.880789866060916e-06, "logits/chosen": -1.6890006065368652, "logits/rejected": -1.5851490497589111, "logps/chosen": -159.71583557128906, "logps/rejected": -61.28202438354492, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 7.0162200927734375, "rewards/margins": 3.6141841411590576, "rewards/rejected": 3.40203595161438, "step": 4160 }, { "epoch": 0.92, "learning_rate": 5.879025510020924e-06, "logits/chosen": -1.839282512664795, "logits/rejected": -1.8277696371078491, "logps/chosen": -65.76058959960938, "logps/rejected": -74.81375885009766, "loss": 0.3428, "rewards/accuracies": 1.0, "rewards/chosen": 6.385887145996094, "rewards/margins": 0.017726898193359375, "rewards/rejected": 6.368160247802734, "step": 4161 }, { "epoch": 0.92, "learning_rate": 5.877261041028266e-06, "logits/chosen": -1.7658976316452026, "logits/rejected": -1.7383966445922852, "logps/chosen": -57.33575439453125, "logps/rejected": -82.60258483886719, "loss": 0.2028, "rewards/accuracies": 1.0, "rewards/chosen": 3.447554111480713, "rewards/margins": 0.7787246704101562, "rewards/rejected": 2.6688294410705566, "step": 4162 }, { "epoch": 0.92, "learning_rate": 5.875496459309673e-06, "logits/chosen": -1.865599274635315, "logits/rejected": -1.9240278005599976, "logps/chosen": -80.74278259277344, "logps/rejected": -106.60189056396484, "loss": 2.0966, "rewards/accuracies": 0.0, "rewards/chosen": 5.66793966293335, "rewards/margins": -2.3582968711853027, "rewards/rejected": 8.026236534118652, "step": 4163 }, { "epoch": 0.92, "learning_rate": 5.8737317650918905e-06, "logits/chosen": -1.6360368728637695, "logits/rejected": -1.5807138681411743, "logps/chosen": -46.14063262939453, "logps/rejected": -41.85446548461914, "loss": 0.9173, "rewards/accuracies": 1.0, "rewards/chosen": 5.714552402496338, "rewards/margins": 1.3519673347473145, "rewards/rejected": 4.362585067749023, "step": 4164 }, { "epoch": 0.92, "learning_rate": 5.871966958601676e-06, "logits/chosen": -1.6804002523422241, "logits/rejected": -1.6046411991119385, "logps/chosen": -143.084716796875, "logps/rejected": -58.62226104736328, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": 5.990042209625244, "rewards/margins": 3.560450792312622, "rewards/rejected": 2.429591417312622, "step": 4165 }, { "epoch": 0.92, "learning_rate": 5.870202040065803e-06, "logits/chosen": -1.8302769660949707, "logits/rejected": -1.8146612644195557, "logps/chosen": -89.38076782226562, "logps/rejected": -70.19732666015625, "loss": 0.7573, "rewards/accuracies": 0.0, "rewards/chosen": 3.0685837268829346, "rewards/margins": -1.2320473194122314, "rewards/rejected": 4.300631046295166, "step": 4166 }, { "epoch": 0.92, "learning_rate": 5.86843700971106e-06, "logits/chosen": -1.2592010498046875, "logits/rejected": -1.2592010498046875, "logps/chosen": -3.8140993118286133, "logps/rejected": -3.8140993118286133, "loss": 0.5969, "rewards/accuracies": 0.0, "rewards/chosen": 0.8965674638748169, "rewards/margins": 0.0, "rewards/rejected": 0.8965674638748169, "step": 4167 }, { "epoch": 0.92, "learning_rate": 5.866671867764249e-06, "logits/chosen": -1.8517817258834839, "logits/rejected": -1.859954595565796, "logps/chosen": -36.41065216064453, "logps/rejected": -89.07090759277344, "loss": 3.8821, "rewards/accuracies": 0.0, "rewards/chosen": 3.106160879135132, "rewards/margins": -1.3927085399627686, "rewards/rejected": 4.4988694190979, "step": 4168 }, { "epoch": 0.92, "learning_rate": 5.864906614452186e-06, "logits/chosen": -1.2721480131149292, "logits/rejected": -1.271324634552002, "logps/chosen": -31.177783966064453, "logps/rejected": -28.06182098388672, "loss": 0.6885, "rewards/accuracies": 0.0, "rewards/chosen": 2.4528491497039795, "rewards/margins": -0.9007482528686523, "rewards/rejected": 3.353597402572632, "step": 4169 }, { "epoch": 0.92, "learning_rate": 5.863141250001705e-06, "logits/chosen": -1.7362853288650513, "logits/rejected": -1.6621986627578735, "logps/chosen": -52.47340393066406, "logps/rejected": -42.89638137817383, "loss": 0.4043, "rewards/accuracies": 1.0, "rewards/chosen": 2.9625167846679688, "rewards/margins": 0.5019075870513916, "rewards/rejected": 2.460609197616577, "step": 4170 }, { "epoch": 0.92, "learning_rate": 5.861375774639645e-06, "logits/chosen": -1.6657871007919312, "logits/rejected": -1.6611498594284058, "logps/chosen": -58.101585388183594, "logps/rejected": -72.65773010253906, "loss": 0.4087, "rewards/accuracies": 0.0, "rewards/chosen": 3.377927541732788, "rewards/margins": -0.028420209884643555, "rewards/rejected": 3.4063477516174316, "step": 4171 }, { "epoch": 0.92, "learning_rate": 5.859610188592871e-06, "logits/chosen": -1.454615592956543, "logits/rejected": -1.4255573749542236, "logps/chosen": -46.581119537353516, "logps/rejected": -32.76447296142578, "loss": 0.2322, "rewards/accuracies": 1.0, "rewards/chosen": 3.7783749103546143, "rewards/margins": 1.3979504108428955, "rewards/rejected": 2.3804244995117188, "step": 4172 }, { "epoch": 0.92, "learning_rate": 5.8578444920882525e-06, "logits/chosen": -1.6481069326400757, "logits/rejected": -1.4746506214141846, "logps/chosen": -188.41964721679688, "logps/rejected": -23.462182998657227, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 7.3161468505859375, "rewards/margins": 7.211514472961426, "rewards/rejected": 0.10463237762451172, "step": 4173 }, { "epoch": 0.92, "learning_rate": 5.856078685352679e-06, "logits/chosen": -1.6267116069793701, "logits/rejected": -1.6359906196594238, "logps/chosen": -98.39315032958984, "logps/rejected": -145.89352416992188, "loss": 0.4819, "rewards/accuracies": 1.0, "rewards/chosen": 6.844907283782959, "rewards/margins": 1.8559331893920898, "rewards/rejected": 4.988974094390869, "step": 4174 }, { "epoch": 0.92, "learning_rate": 5.854312768613053e-06, "logits/chosen": -1.6168334484100342, "logits/rejected": -1.570226788520813, "logps/chosen": -38.500762939453125, "logps/rejected": -51.72056198120117, "loss": 0.3486, "rewards/accuracies": 1.0, "rewards/chosen": 3.3887710571289062, "rewards/margins": 0.04031109809875488, "rewards/rejected": 3.3484599590301514, "step": 4175 }, { "epoch": 0.92, "learning_rate": 5.85254674209629e-06, "logits/chosen": -1.6322200298309326, "logits/rejected": -1.6169911623001099, "logps/chosen": -87.69217681884766, "logps/rejected": -28.783823013305664, "loss": 0.3607, "rewards/accuracies": 1.0, "rewards/chosen": 5.6438422203063965, "rewards/margins": 2.4635837078094482, "rewards/rejected": 3.1802585124969482, "step": 4176 }, { "epoch": 0.92, "learning_rate": 5.850780606029319e-06, "logits/chosen": -1.5796254873275757, "logits/rejected": -1.6299350261688232, "logps/chosen": -41.67695236206055, "logps/rejected": -107.08601379394531, "loss": 2.5658, "rewards/accuracies": 0.0, "rewards/chosen": 3.9454212188720703, "rewards/margins": -4.756805419921875, "rewards/rejected": 8.702226638793945, "step": 4177 }, { "epoch": 0.92, "learning_rate": 5.849014360639087e-06, "logits/chosen": -1.5298336744308472, "logits/rejected": -1.5298336744308472, "logps/chosen": -20.941862106323242, "logps/rejected": -20.941862106323242, "loss": 0.3832, "rewards/accuracies": 0.0, "rewards/chosen": 1.5362520217895508, "rewards/margins": 0.0, "rewards/rejected": 1.5362520217895508, "step": 4178 }, { "epoch": 0.92, "learning_rate": 5.847248006152549e-06, "logits/chosen": -1.8609709739685059, "logits/rejected": -1.8238520622253418, "logps/chosen": -126.27401733398438, "logps/rejected": -48.91057586669922, "loss": 1.861, "rewards/accuracies": 1.0, "rewards/chosen": 5.669915676116943, "rewards/margins": 2.323967695236206, "rewards/rejected": 3.3459479808807373, "step": 4179 }, { "epoch": 0.93, "learning_rate": 5.845481542796681e-06, "logits/chosen": -1.3708531856536865, "logits/rejected": -1.3708531856536865, "logps/chosen": -83.89922332763672, "logps/rejected": -83.89922332763672, "loss": 0.4955, "rewards/accuracies": 0.0, "rewards/chosen": 2.4523308277130127, "rewards/margins": 0.0, "rewards/rejected": 2.4523308277130127, "step": 4180 }, { "epoch": 0.93, "learning_rate": 5.843714970798466e-06, "logits/chosen": -2.036292791366577, "logits/rejected": -1.487876296043396, "logps/chosen": -147.1263885498047, "logps/rejected": -45.883323669433594, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": 7.410869121551514, "rewards/margins": 3.825242042541504, "rewards/rejected": 3.5856270790100098, "step": 4181 }, { "epoch": 0.93, "learning_rate": 5.8419482903849066e-06, "logits/chosen": -1.5900757312774658, "logits/rejected": -1.5313889980316162, "logps/chosen": -75.0361328125, "logps/rejected": -69.05934143066406, "loss": 0.5158, "rewards/accuracies": 0.0, "rewards/chosen": 3.0645904541015625, "rewards/margins": -0.4883286952972412, "rewards/rejected": 3.5529191493988037, "step": 4182 }, { "epoch": 0.93, "learning_rate": 5.840181501783014e-06, "logits/chosen": -1.5329535007476807, "logits/rejected": -1.590698480606079, "logps/chosen": -109.52760314941406, "logps/rejected": -52.537147521972656, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 5.070367336273193, "rewards/margins": 2.151899576187134, "rewards/rejected": 2.9184677600860596, "step": 4183 }, { "epoch": 0.93, "learning_rate": 5.838414605219819e-06, "logits/chosen": -1.5566307306289673, "logits/rejected": -1.527056336402893, "logps/chosen": -79.85711669921875, "logps/rejected": -90.05659484863281, "loss": 0.1741, "rewards/accuracies": 1.0, "rewards/chosen": 4.978886604309082, "rewards/margins": 0.9563875198364258, "rewards/rejected": 4.022499084472656, "step": 4184 }, { "epoch": 0.93, "learning_rate": 5.836647600922363e-06, "logits/chosen": -2.1138367652893066, "logits/rejected": -2.064901828765869, "logps/chosen": -83.20864868164062, "logps/rejected": -116.62870025634766, "loss": 0.7125, "rewards/accuracies": 1.0, "rewards/chosen": 6.721848964691162, "rewards/margins": 0.1790485382080078, "rewards/rejected": 6.542800426483154, "step": 4185 }, { "epoch": 0.93, "learning_rate": 5.8348804891177e-06, "logits/chosen": -1.8246630430221558, "logits/rejected": -1.5651847124099731, "logps/chosen": -108.61014556884766, "logps/rejected": -99.50770568847656, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 7.624582767486572, "rewards/margins": 5.671124458312988, "rewards/rejected": 1.9534584283828735, "step": 4186 }, { "epoch": 0.93, "learning_rate": 5.833113270032903e-06, "logits/chosen": -1.574593186378479, "logits/rejected": -1.5761713981628418, "logps/chosen": -41.04888153076172, "logps/rejected": -83.77780151367188, "loss": 1.4055, "rewards/accuracies": 0.0, "rewards/chosen": 2.475937604904175, "rewards/margins": -1.0945603847503662, "rewards/rejected": 3.570497989654541, "step": 4187 }, { "epoch": 0.93, "learning_rate": 5.831345943895054e-06, "logits/chosen": -1.563035249710083, "logits/rejected": -1.4627721309661865, "logps/chosen": -47.62641143798828, "logps/rejected": -12.025093078613281, "loss": 0.3814, "rewards/accuracies": 1.0, "rewards/chosen": 2.5490806102752686, "rewards/margins": 1.5314408540725708, "rewards/rejected": 1.0176397562026978, "step": 4188 }, { "epoch": 0.93, "learning_rate": 5.829578510931249e-06, "logits/chosen": -1.7317898273468018, "logits/rejected": -1.645308017730713, "logps/chosen": -94.29624938964844, "logps/rejected": -53.38544845581055, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 7.963482856750488, "rewards/margins": 4.657985687255859, "rewards/rejected": 3.305497407913208, "step": 4189 }, { "epoch": 0.93, "learning_rate": 5.827810971368598e-06, "logits/chosen": -1.7339664697647095, "logits/rejected": -1.663555383682251, "logps/chosen": -51.34978485107422, "logps/rejected": -88.8525390625, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 3.0374717712402344, "rewards/margins": 0.9271728992462158, "rewards/rejected": 2.1102988719940186, "step": 4190 }, { "epoch": 0.93, "learning_rate": 5.82604332543423e-06, "logits/chosen": -1.7921737432479858, "logits/rejected": -1.813628911972046, "logps/chosen": -97.20123291015625, "logps/rejected": -89.23863220214844, "loss": 0.7444, "rewards/accuracies": 0.0, "rewards/chosen": 5.816694736480713, "rewards/margins": -1.0930571556091309, "rewards/rejected": 6.909751892089844, "step": 4191 }, { "epoch": 0.93, "learning_rate": 5.824275573355278e-06, "logits/chosen": -1.875550627708435, "logits/rejected": -1.8837571144104004, "logps/chosen": -54.1448974609375, "logps/rejected": -44.40851593017578, "loss": 1.524, "rewards/accuracies": 0.0, "rewards/chosen": 2.336013078689575, "rewards/margins": -1.8046576976776123, "rewards/rejected": 4.1406707763671875, "step": 4192 }, { "epoch": 0.93, "learning_rate": 5.822507715358897e-06, "logits/chosen": -1.7518001794815063, "logits/rejected": -1.350182056427002, "logps/chosen": -40.091400146484375, "logps/rejected": -143.3791961669922, "loss": 3.5231, "rewards/accuracies": 0.0, "rewards/chosen": 3.085527181625366, "rewards/margins": -4.416082382202148, "rewards/rejected": 7.501609802246094, "step": 4193 }, { "epoch": 0.93, "learning_rate": 5.820739751672252e-06, "logits/chosen": -1.739972710609436, "logits/rejected": -1.739972710609436, "logps/chosen": -61.70177459716797, "logps/rejected": -61.70177459716797, "loss": 0.7253, "rewards/accuracies": 0.0, "rewards/chosen": 2.7143020629882812, "rewards/margins": 0.0, "rewards/rejected": 2.7143020629882812, "step": 4194 }, { "epoch": 0.93, "learning_rate": 5.818971682522522e-06, "logits/chosen": -1.823140025138855, "logits/rejected": -1.8204926252365112, "logps/chosen": -87.18074798583984, "logps/rejected": -107.56636810302734, "loss": 0.1853, "rewards/accuracies": 1.0, "rewards/chosen": 5.0670952796936035, "rewards/margins": 1.1375916004180908, "rewards/rejected": 3.9295036792755127, "step": 4195 }, { "epoch": 0.93, "learning_rate": 5.817203508136898e-06, "logits/chosen": -1.6749719381332397, "logits/rejected": -1.719759464263916, "logps/chosen": -61.37724304199219, "logps/rejected": -71.31729888916016, "loss": 1.0762, "rewards/accuracies": 1.0, "rewards/chosen": 3.2533950805664062, "rewards/margins": 0.2755453586578369, "rewards/rejected": 2.9778497219085693, "step": 4196 }, { "epoch": 0.93, "learning_rate": 5.815435228742591e-06, "logits/chosen": -1.719517469406128, "logits/rejected": -1.7224103212356567, "logps/chosen": -63.96522521972656, "logps/rejected": -46.374664306640625, "loss": 0.811, "rewards/accuracies": 0.0, "rewards/chosen": 2.464003801345825, "rewards/margins": -0.42616724967956543, "rewards/rejected": 2.8901710510253906, "step": 4197 }, { "epoch": 0.93, "learning_rate": 5.813666844566815e-06, "logits/chosen": -1.888938307762146, "logits/rejected": -1.976661205291748, "logps/chosen": -46.905967712402344, "logps/rejected": -70.18756103515625, "loss": 1.4512, "rewards/accuracies": 0.0, "rewards/chosen": 2.8055343627929688, "rewards/margins": -2.6828384399414062, "rewards/rejected": 5.488372802734375, "step": 4198 }, { "epoch": 0.93, "learning_rate": 5.811898355836806e-06, "logits/chosen": -1.5118719339370728, "logits/rejected": -1.4143881797790527, "logps/chosen": -42.633663177490234, "logps/rejected": -50.010379791259766, "loss": 0.935, "rewards/accuracies": 1.0, "rewards/chosen": 5.739680767059326, "rewards/margins": 4.486298084259033, "rewards/rejected": 1.2533825635910034, "step": 4199 }, { "epoch": 0.93, "learning_rate": 5.8101297627798105e-06, "logits/chosen": -1.8247618675231934, "logits/rejected": -1.743093490600586, "logps/chosen": -51.98989486694336, "logps/rejected": -9.85204792022705, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 1.7543102502822876, "rewards/margins": 0.15613985061645508, "rewards/rejected": 1.5981703996658325, "step": 4200 }, { "epoch": 0.93, "learning_rate": 5.8083610656230874e-06, "logits/chosen": -1.8794691562652588, "logits/rejected": -1.8464292287826538, "logps/chosen": -98.49504089355469, "logps/rejected": -77.14308166503906, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 5.2378249168396, "rewards/margins": 3.491766929626465, "rewards/rejected": 1.7460578680038452, "step": 4201 }, { "epoch": 0.93, "learning_rate": 5.806592264593911e-06, "logits/chosen": -1.4947792291641235, "logits/rejected": -1.4980881214141846, "logps/chosen": -42.05538558959961, "logps/rejected": -40.70185089111328, "loss": 1.6757, "rewards/accuracies": 0.0, "rewards/chosen": 1.982817530632019, "rewards/margins": -0.09696757793426514, "rewards/rejected": 2.079785108566284, "step": 4202 }, { "epoch": 0.93, "learning_rate": 5.8048233599195665e-06, "logits/chosen": -1.6331762075424194, "logits/rejected": -1.6799802780151367, "logps/chosen": -128.48316955566406, "logps/rejected": -165.31893920898438, "loss": 1.1576, "rewards/accuracies": 0.0, "rewards/chosen": 4.919926643371582, "rewards/margins": -2.208874225616455, "rewards/rejected": 7.128800868988037, "step": 4203 }, { "epoch": 0.93, "learning_rate": 5.803054351827356e-06, "logits/chosen": -2.084294557571411, "logits/rejected": -2.089831829071045, "logps/chosen": -61.48610305786133, "logps/rejected": -136.9990234375, "loss": 2.3155, "rewards/accuracies": 0.0, "rewards/chosen": 5.610203266143799, "rewards/margins": -0.535820484161377, "rewards/rejected": 6.146023750305176, "step": 4204 }, { "epoch": 0.93, "learning_rate": 5.801285240544593e-06, "logits/chosen": -1.830432415008545, "logits/rejected": -1.815502643585205, "logps/chosen": -67.13294219970703, "logps/rejected": -79.54817199707031, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 4.954537391662598, "rewards/margins": 1.935349464416504, "rewards/rejected": 3.0191879272460938, "step": 4205 }, { "epoch": 0.93, "learning_rate": 5.799516026298601e-06, "logits/chosen": -1.729896068572998, "logits/rejected": -1.5435566902160645, "logps/chosen": -125.80265808105469, "logps/rejected": -17.621715545654297, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 4.430119514465332, "rewards/margins": 4.237665176391602, "rewards/rejected": 0.19245433807373047, "step": 4206 }, { "epoch": 0.93, "learning_rate": 5.797746709316722e-06, "logits/chosen": -2.0490036010742188, "logits/rejected": -1.6942673921585083, "logps/chosen": -53.172821044921875, "logps/rejected": -73.21011352539062, "loss": 2.7954, "rewards/accuracies": 0.0, "rewards/chosen": 3.3382515907287598, "rewards/margins": -0.3494575023651123, "rewards/rejected": 3.687709093093872, "step": 4207 }, { "epoch": 0.93, "learning_rate": 5.795977289826308e-06, "logits/chosen": -1.7504132986068726, "logits/rejected": -1.6617352962493896, "logps/chosen": -57.439292907714844, "logps/rejected": -59.27217102050781, "loss": 0.7546, "rewards/accuracies": 0.0, "rewards/chosen": 1.9452629089355469, "rewards/margins": -0.5077598094940186, "rewards/rejected": 2.4530227184295654, "step": 4208 }, { "epoch": 0.93, "learning_rate": 5.794207768054726e-06, "logits/chosen": -1.7427984476089478, "logits/rejected": -1.7261962890625, "logps/chosen": -50.54860305786133, "logps/rejected": -54.61432647705078, "loss": 1.153, "rewards/accuracies": 0.0, "rewards/chosen": 2.2688167095184326, "rewards/margins": -0.6914525032043457, "rewards/rejected": 2.9602692127227783, "step": 4209 }, { "epoch": 0.93, "learning_rate": 5.7924381442293554e-06, "logits/chosen": -1.5307172536849976, "logits/rejected": -1.5307172536849976, "logps/chosen": -66.97824096679688, "logps/rejected": -66.97824096679688, "loss": 0.7862, "rewards/accuracies": 0.0, "rewards/chosen": 3.2808823585510254, "rewards/margins": 0.0, "rewards/rejected": 3.2808823585510254, "step": 4210 }, { "epoch": 0.93, "learning_rate": 5.790668418577588e-06, "logits/chosen": -1.7045083045959473, "logits/rejected": -1.728589415550232, "logps/chosen": -35.158782958984375, "logps/rejected": -130.84567260742188, "loss": 2.3215, "rewards/accuracies": 0.0, "rewards/chosen": 2.578782796859741, "rewards/margins": -2.7207887172698975, "rewards/rejected": 5.299571514129639, "step": 4211 }, { "epoch": 0.93, "learning_rate": 5.7888985913268305e-06, "logits/chosen": -1.6327292919158936, "logits/rejected": -1.587909460067749, "logps/chosen": -70.39918518066406, "logps/rejected": -68.08222198486328, "loss": 0.496, "rewards/accuracies": 1.0, "rewards/chosen": 3.7770767211914062, "rewards/margins": 0.6626236438751221, "rewards/rejected": 3.114453077316284, "step": 4212 }, { "epoch": 0.93, "learning_rate": 5.7871286627045e-06, "logits/chosen": -1.6153234243392944, "logits/rejected": -1.6153234243392944, "logps/chosen": -39.53354263305664, "logps/rejected": -39.53354263305664, "loss": 0.3841, "rewards/accuracies": 0.0, "rewards/chosen": 2.0524463653564453, "rewards/margins": 0.0, "rewards/rejected": 2.0524463653564453, "step": 4213 }, { "epoch": 0.93, "learning_rate": 5.785358632938028e-06, "logits/chosen": -2.0697150230407715, "logits/rejected": -1.9789869785308838, "logps/chosen": -57.19340896606445, "logps/rejected": -60.80976486206055, "loss": 0.2903, "rewards/accuracies": 1.0, "rewards/chosen": 3.966198444366455, "rewards/margins": 1.585395097732544, "rewards/rejected": 2.380803346633911, "step": 4214 }, { "epoch": 0.93, "learning_rate": 5.7835885022548595e-06, "logits/chosen": -1.7354485988616943, "logits/rejected": -1.7149752378463745, "logps/chosen": -55.607025146484375, "logps/rejected": -36.47392272949219, "loss": 0.4215, "rewards/accuracies": 0.0, "rewards/chosen": 2.671823263168335, "rewards/margins": -0.021210432052612305, "rewards/rejected": 2.6930336952209473, "step": 4215 }, { "epoch": 0.93, "learning_rate": 5.781818270882453e-06, "logits/chosen": -1.8603222370147705, "logits/rejected": -1.7804975509643555, "logps/chosen": -112.12407684326172, "logps/rejected": -59.86465835571289, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": 7.647445201873779, "rewards/margins": 3.910046339035034, "rewards/rejected": 3.737398862838745, "step": 4216 }, { "epoch": 0.93, "learning_rate": 5.7800479390482776e-06, "logits/chosen": -1.5134295225143433, "logits/rejected": -1.3376408815383911, "logps/chosen": -49.587242126464844, "logps/rejected": -22.608327865600586, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": 2.906141757965088, "rewards/margins": 2.542863368988037, "rewards/rejected": 0.3632783889770508, "step": 4217 }, { "epoch": 0.93, "learning_rate": 5.778277506979817e-06, "logits/chosen": -1.517580270767212, "logits/rejected": -1.5926311016082764, "logps/chosen": -60.78312683105469, "logps/rejected": -91.5118408203125, "loss": 1.7972, "rewards/accuracies": 0.0, "rewards/chosen": 2.0187454223632812, "rewards/margins": -2.781507968902588, "rewards/rejected": 4.800253391265869, "step": 4218 }, { "epoch": 0.93, "learning_rate": 5.7765069749045674e-06, "logits/chosen": -1.7806453704833984, "logits/rejected": -1.7404733896255493, "logps/chosen": -56.992435455322266, "logps/rejected": -58.399085998535156, "loss": 0.1579, "rewards/accuracies": 1.0, "rewards/chosen": 4.380527973175049, "rewards/margins": 1.293731451034546, "rewards/rejected": 3.086796522140503, "step": 4219 }, { "epoch": 0.93, "learning_rate": 5.7747363430500395e-06, "logits/chosen": -1.6800124645233154, "logits/rejected": -1.7279739379882812, "logps/chosen": -62.60627746582031, "logps/rejected": -87.10395812988281, "loss": 1.3794, "rewards/accuracies": 0.0, "rewards/chosen": 2.2149384021759033, "rewards/margins": -2.300541639328003, "rewards/rejected": 4.515480041503906, "step": 4220 }, { "epoch": 0.93, "learning_rate": 5.772965611643754e-06, "logits/chosen": -1.8500120639801025, "logits/rejected": -1.856873631477356, "logps/chosen": -69.6783676147461, "logps/rejected": -46.759674072265625, "loss": 0.4363, "rewards/accuracies": 0.0, "rewards/chosen": 3.9218666553497314, "rewards/margins": -0.10892128944396973, "rewards/rejected": 4.030787944793701, "step": 4221 }, { "epoch": 0.93, "learning_rate": 5.771194780913244e-06, "logits/chosen": -1.9648582935333252, "logits/rejected": -1.9950265884399414, "logps/chosen": -115.34359741210938, "logps/rejected": -99.29865264892578, "loss": 0.0967, "rewards/accuracies": 1.0, "rewards/chosen": 5.943139553070068, "rewards/margins": 1.5941929817199707, "rewards/rejected": 4.348946571350098, "step": 4222 }, { "epoch": 0.93, "learning_rate": 5.76942385108606e-06, "logits/chosen": -1.3766093254089355, "logits/rejected": -1.31516432762146, "logps/chosen": -34.07599639892578, "logps/rejected": -59.896453857421875, "loss": 0.2656, "rewards/accuracies": 1.0, "rewards/chosen": 2.0759284496307373, "rewards/margins": 0.36072230339050293, "rewards/rejected": 1.7152061462402344, "step": 4223 }, { "epoch": 0.93, "learning_rate": 5.76765282238976e-06, "logits/chosen": -1.7771698236465454, "logits/rejected": -1.7615433931350708, "logps/chosen": -62.87453842163086, "logps/rejected": -59.03959655761719, "loss": 1.1154, "rewards/accuracies": 0.0, "rewards/chosen": 3.8355824947357178, "rewards/margins": -0.9206392765045166, "rewards/rejected": 4.756221771240234, "step": 4224 }, { "epoch": 0.94, "learning_rate": 5.765881695051918e-06, "logits/chosen": -2.4198153018951416, "logits/rejected": -2.2945809364318848, "logps/chosen": -67.43820190429688, "logps/rejected": -158.21658325195312, "loss": 2.8051, "rewards/accuracies": 0.0, "rewards/chosen": 4.301790714263916, "rewards/margins": -4.252268314361572, "rewards/rejected": 8.554059028625488, "step": 4225 }, { "epoch": 0.94, "learning_rate": 5.764110469300118e-06, "logits/chosen": -1.39658784866333, "logits/rejected": -1.1349881887435913, "logps/chosen": -68.84930419921875, "logps/rejected": -90.36333465576172, "loss": 2.1089, "rewards/accuracies": 0.0, "rewards/chosen": 1.700112223625183, "rewards/margins": -3.4946823120117188, "rewards/rejected": 5.194794654846191, "step": 4226 }, { "epoch": 0.94, "learning_rate": 5.762339145361962e-06, "logits/chosen": -1.6482678651809692, "logits/rejected": -1.5673028230667114, "logps/chosen": -107.54804992675781, "logps/rejected": -131.53890991210938, "loss": 3.2628, "rewards/accuracies": 0.0, "rewards/chosen": 3.2597732543945312, "rewards/margins": -6.497766494750977, "rewards/rejected": 9.757539749145508, "step": 4227 }, { "epoch": 0.94, "learning_rate": 5.760567723465056e-06, "logits/chosen": -1.782373309135437, "logits/rejected": -1.7055128812789917, "logps/chosen": -104.05094909667969, "logps/rejected": -40.093833923339844, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 5.799163818359375, "rewards/margins": 3.6442549228668213, "rewards/rejected": 2.1549088954925537, "step": 4228 }, { "epoch": 0.94, "learning_rate": 5.7587962038370275e-06, "logits/chosen": -1.6233094930648804, "logits/rejected": -1.603810429573059, "logps/chosen": -100.05038452148438, "logps/rejected": -163.43548583984375, "loss": 2.0868, "rewards/accuracies": 0.0, "rewards/chosen": 5.7378249168396, "rewards/margins": -4.157145977020264, "rewards/rejected": 9.894970893859863, "step": 4229 }, { "epoch": 0.94, "learning_rate": 5.757024586705511e-06, "logits/chosen": -1.7081518173217773, "logits/rejected": -1.6265214681625366, "logps/chosen": -56.79399108886719, "logps/rejected": -137.62503051757812, "loss": 0.3803, "rewards/accuracies": 0.0, "rewards/chosen": 2.929114580154419, "rewards/margins": -0.11895370483398438, "rewards/rejected": 3.0480682849884033, "step": 4230 }, { "epoch": 0.94, "learning_rate": 5.755252872298154e-06, "logits/chosen": -1.57957124710083, "logits/rejected": -1.5175923109054565, "logps/chosen": -82.29058074951172, "logps/rejected": -65.67910766601562, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 5.548816680908203, "rewards/margins": 3.7313575744628906, "rewards/rejected": 1.8174591064453125, "step": 4231 }, { "epoch": 0.94, "learning_rate": 5.7534810608426194e-06, "logits/chosen": -1.8919063806533813, "logits/rejected": -1.8583425283432007, "logps/chosen": -78.08714294433594, "logps/rejected": -125.28111267089844, "loss": 0.8656, "rewards/accuracies": 0.0, "rewards/chosen": 7.481929302215576, "rewards/margins": -1.3548064231872559, "rewards/rejected": 8.836735725402832, "step": 4232 }, { "epoch": 0.94, "learning_rate": 5.751709152566579e-06, "logits/chosen": -1.8508394956588745, "logits/rejected": -1.891671895980835, "logps/chosen": -96.93769836425781, "logps/rejected": -128.90977478027344, "loss": 0.5008, "rewards/accuracies": 0.0, "rewards/chosen": 6.578688144683838, "rewards/margins": -0.5395917892456055, "rewards/rejected": 7.118279933929443, "step": 4233 }, { "epoch": 0.94, "learning_rate": 5.74993714769772e-06, "logits/chosen": -1.776293396949768, "logits/rejected": -1.776293396949768, "logps/chosen": -60.8404541015625, "logps/rejected": -60.8404541015625, "loss": 0.3509, "rewards/accuracies": 0.0, "rewards/chosen": 6.800201416015625, "rewards/margins": 0.0, "rewards/rejected": 6.800201416015625, "step": 4234 }, { "epoch": 0.94, "learning_rate": 5.7481650464637394e-06, "logits/chosen": -1.4523649215698242, "logits/rejected": -1.362618327140808, "logps/chosen": -94.19621276855469, "logps/rejected": -60.557044982910156, "loss": 0.452, "rewards/accuracies": 1.0, "rewards/chosen": 7.717311382293701, "rewards/margins": 3.18239164352417, "rewards/rejected": 4.534919738769531, "step": 4235 }, { "epoch": 0.94, "learning_rate": 5.746392849092349e-06, "logits/chosen": -1.6677302122116089, "logits/rejected": -1.6301270723342896, "logps/chosen": -73.33057403564453, "logps/rejected": -68.09944152832031, "loss": 1.777, "rewards/accuracies": 0.0, "rewards/chosen": 3.7746498584747314, "rewards/margins": -3.2112557888031006, "rewards/rejected": 6.985905647277832, "step": 4236 }, { "epoch": 0.94, "learning_rate": 5.744620555811272e-06, "logits/chosen": -1.7088924646377563, "logits/rejected": -1.6638838052749634, "logps/chosen": -80.43865966796875, "logps/rejected": -38.49281311035156, "loss": 1.1974, "rewards/accuracies": 0.0, "rewards/chosen": 2.648210287094116, "rewards/margins": -0.01578354835510254, "rewards/rejected": 2.6639938354492188, "step": 4237 }, { "epoch": 0.94, "learning_rate": 5.742848166848244e-06, "logits/chosen": -1.5195871591567993, "logits/rejected": -1.5623669624328613, "logps/chosen": -128.71116638183594, "logps/rejected": -120.20279693603516, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 8.317588806152344, "rewards/margins": 2.1790199279785156, "rewards/rejected": 6.138568878173828, "step": 4238 }, { "epoch": 0.94, "learning_rate": 5.741075682431013e-06, "logits/chosen": -1.7475334405899048, "logits/rejected": -1.7178311347961426, "logps/chosen": -31.9600772857666, "logps/rejected": -63.61233901977539, "loss": 0.4541, "rewards/accuracies": 0.0, "rewards/chosen": 3.137600898742676, "rewards/margins": -0.09634184837341309, "rewards/rejected": 3.233942747116089, "step": 4239 }, { "epoch": 0.94, "learning_rate": 5.739303102787338e-06, "logits/chosen": -1.6402101516723633, "logits/rejected": -1.6311140060424805, "logps/chosen": -67.85240936279297, "logps/rejected": -42.1629524230957, "loss": 0.5458, "rewards/accuracies": 1.0, "rewards/chosen": 1.867775797843933, "rewards/margins": 0.573733925819397, "rewards/rejected": 1.2940418720245361, "step": 4240 }, { "epoch": 0.94, "learning_rate": 5.73753042814499e-06, "logits/chosen": -1.3618299961090088, "logits/rejected": -1.2848401069641113, "logps/chosen": -34.19430923461914, "logps/rejected": -49.906700134277344, "loss": 1.0279, "rewards/accuracies": 0.0, "rewards/chosen": 1.5264781713485718, "rewards/margins": -1.2096515893936157, "rewards/rejected": 2.7361297607421875, "step": 4241 }, { "epoch": 0.94, "learning_rate": 5.735757658731757e-06, "logits/chosen": -1.7342880964279175, "logits/rejected": -1.7342880964279175, "logps/chosen": -35.727867126464844, "logps/rejected": -35.727867126464844, "loss": 0.6648, "rewards/accuracies": 0.0, "rewards/chosen": 2.9432945251464844, "rewards/margins": 0.0, "rewards/rejected": 2.9432945251464844, "step": 4242 }, { "epoch": 0.94, "learning_rate": 5.7339847947754334e-06, "logits/chosen": -1.3452751636505127, "logits/rejected": -1.3520116806030273, "logps/chosen": -53.290367126464844, "logps/rejected": -40.780784606933594, "loss": 0.1562, "rewards/accuracies": 1.0, "rewards/chosen": 2.9207725524902344, "rewards/margins": 1.0742202997207642, "rewards/rejected": 1.8465522527694702, "step": 4243 }, { "epoch": 0.94, "learning_rate": 5.732211836503827e-06, "logits/chosen": -1.8942490816116333, "logits/rejected": -1.913801670074463, "logps/chosen": -73.92221069335938, "logps/rejected": -42.89706039428711, "loss": 3.0799, "rewards/accuracies": 0.0, "rewards/chosen": 2.801800489425659, "rewards/margins": -2.7516071796417236, "rewards/rejected": 5.553407669067383, "step": 4244 }, { "epoch": 0.94, "learning_rate": 5.73043878414476e-06, "logits/chosen": -1.6423181295394897, "logits/rejected": -1.5492932796478271, "logps/chosen": -90.75895690917969, "logps/rejected": -73.16326904296875, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 6.621618747711182, "rewards/margins": 3.167105197906494, "rewards/rejected": 3.4545135498046875, "step": 4245 }, { "epoch": 0.94, "learning_rate": 5.728665637926065e-06, "logits/chosen": -1.5340639352798462, "logits/rejected": -1.5340639352798462, "logps/chosen": -1.609544277191162, "logps/rejected": -1.609544277191162, "loss": 0.7997, "rewards/accuracies": 0.0, "rewards/chosen": 1.3302866220474243, "rewards/margins": 0.0, "rewards/rejected": 1.3302866220474243, "step": 4246 }, { "epoch": 0.94, "learning_rate": 5.726892398075588e-06, "logits/chosen": -1.6473597288131714, "logits/rejected": -1.6306949853897095, "logps/chosen": -91.13556671142578, "logps/rejected": -50.82375717163086, "loss": 0.6482, "rewards/accuracies": 0.0, "rewards/chosen": 3.5932717323303223, "rewards/margins": -0.4980611801147461, "rewards/rejected": 4.091332912445068, "step": 4247 }, { "epoch": 0.94, "learning_rate": 5.725119064821185e-06, "logits/chosen": -1.532443881034851, "logits/rejected": -1.458377718925476, "logps/chosen": -58.31755447387695, "logps/rejected": -29.581867218017578, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 2.681251287460327, "rewards/margins": 1.8972501754760742, "rewards/rejected": 0.7840011715888977, "step": 4248 }, { "epoch": 0.94, "learning_rate": 5.723345638390725e-06, "logits/chosen": -1.8231257200241089, "logits/rejected": -1.6488193273544312, "logps/chosen": -109.056396484375, "logps/rejected": -22.88393783569336, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 6.03518533706665, "rewards/margins": 4.651972770690918, "rewards/rejected": 1.3832123279571533, "step": 4249 }, { "epoch": 0.94, "learning_rate": 5.721572119012089e-06, "logits/chosen": -1.8256033658981323, "logits/rejected": -1.8942772150039673, "logps/chosen": -168.54739379882812, "logps/rejected": -97.90109252929688, "loss": 0.5477, "rewards/accuracies": 0.0, "rewards/chosen": 8.042119026184082, "rewards/margins": -0.686619758605957, "rewards/rejected": 8.728738784790039, "step": 4250 }, { "epoch": 0.94, "learning_rate": 5.71979850691317e-06, "logits/chosen": -1.6814570426940918, "logits/rejected": -1.56594979763031, "logps/chosen": -70.38829040527344, "logps/rejected": -58.67323303222656, "loss": 0.4834, "rewards/accuracies": 1.0, "rewards/chosen": 4.2155351638793945, "rewards/margins": 0.47431349754333496, "rewards/rejected": 3.7412216663360596, "step": 4251 }, { "epoch": 0.94, "learning_rate": 5.718024802321875e-06, "logits/chosen": -1.8151252269744873, "logits/rejected": -1.7851943969726562, "logps/chosen": -156.5235595703125, "logps/rejected": -74.10801696777344, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 7.981671333312988, "rewards/margins": 2.8818039894104004, "rewards/rejected": 5.099867343902588, "step": 4252 }, { "epoch": 0.94, "learning_rate": 5.716251005466118e-06, "logits/chosen": -1.5604183673858643, "logits/rejected": -1.544907569885254, "logps/chosen": -44.012001037597656, "logps/rejected": -58.39291000366211, "loss": 1.2377, "rewards/accuracies": 0.0, "rewards/chosen": 1.9656482934951782, "rewards/margins": -2.030592441558838, "rewards/rejected": 3.9962406158447266, "step": 4253 }, { "epoch": 0.94, "learning_rate": 5.714477116573828e-06, "logits/chosen": -1.9298529624938965, "logits/rejected": -1.9101831912994385, "logps/chosen": -157.39926147460938, "logps/rejected": -56.184547424316406, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 8.355567932128906, "rewards/margins": 5.4718756675720215, "rewards/rejected": 2.8836922645568848, "step": 4254 }, { "epoch": 0.94, "learning_rate": 5.712703135872947e-06, "logits/chosen": -1.6880502700805664, "logits/rejected": -1.6853001117706299, "logps/chosen": -71.5612564086914, "logps/rejected": -57.36839294433594, "loss": 0.2163, "rewards/accuracies": 1.0, "rewards/chosen": 3.313720703125, "rewards/margins": 0.6560103893280029, "rewards/rejected": 2.657710313796997, "step": 4255 }, { "epoch": 0.94, "learning_rate": 5.710929063591426e-06, "logits/chosen": -1.4459218978881836, "logits/rejected": -1.4850184917449951, "logps/chosen": -34.32998275756836, "logps/rejected": -69.79271697998047, "loss": 0.8394, "rewards/accuracies": 0.0, "rewards/chosen": 2.395251512527466, "rewards/margins": -0.46748876571655273, "rewards/rejected": 2.8627402782440186, "step": 4256 }, { "epoch": 0.94, "learning_rate": 5.709154899957229e-06, "logits/chosen": -1.627666711807251, "logits/rejected": -1.5446691513061523, "logps/chosen": -156.43466186523438, "logps/rejected": -152.5352325439453, "loss": 2.0988, "rewards/accuracies": 0.0, "rewards/chosen": 5.693992614746094, "rewards/margins": -4.095819473266602, "rewards/rejected": 9.789812088012695, "step": 4257 }, { "epoch": 0.94, "learning_rate": 5.7073806451983325e-06, "logits/chosen": -1.7445707321166992, "logits/rejected": -1.7150816917419434, "logps/chosen": -42.9992561340332, "logps/rejected": -39.83706283569336, "loss": 0.4471, "rewards/accuracies": 0.0, "rewards/chosen": 2.4547009468078613, "rewards/margins": -0.24796509742736816, "rewards/rejected": 2.7026660442352295, "step": 4258 }, { "epoch": 0.94, "learning_rate": 5.705606299542723e-06, "logits/chosen": -1.6885132789611816, "logits/rejected": -1.730400800704956, "logps/chosen": -21.188100814819336, "logps/rejected": -49.89945602416992, "loss": 1.2523, "rewards/accuracies": 0.0, "rewards/chosen": 2.4706358909606934, "rewards/margins": -1.8717961311340332, "rewards/rejected": 4.342432022094727, "step": 4259 }, { "epoch": 0.94, "learning_rate": 5.703831863218401e-06, "logits/chosen": -1.6828234195709229, "logits/rejected": -1.695713996887207, "logps/chosen": -48.19472122192383, "logps/rejected": -54.55512237548828, "loss": 0.3085, "rewards/accuracies": 1.0, "rewards/chosen": 4.45039701461792, "rewards/margins": 0.1632075309753418, "rewards/rejected": 4.287189483642578, "step": 4260 }, { "epoch": 0.94, "learning_rate": 5.702057336453376e-06, "logits/chosen": -1.3739994764328003, "logits/rejected": -1.3675240278244019, "logps/chosen": -8.448089599609375, "logps/rejected": -5.482504844665527, "loss": 0.8049, "rewards/accuracies": 0.0, "rewards/chosen": 0.6852337121963501, "rewards/margins": -0.22607868909835815, "rewards/rejected": 0.9113124012947083, "step": 4261 }, { "epoch": 0.94, "learning_rate": 5.700282719475672e-06, "logits/chosen": -1.5557812452316284, "logits/rejected": -1.444186806678772, "logps/chosen": -61.24076843261719, "logps/rejected": -119.75386047363281, "loss": 3.0703, "rewards/accuracies": 0.0, "rewards/chosen": 2.0554306507110596, "rewards/margins": -2.7534945011138916, "rewards/rejected": 4.808925151824951, "step": 4262 }, { "epoch": 0.94, "learning_rate": 5.698508012513321e-06, "logits/chosen": -1.7428972721099854, "logits/rejected": -1.6460715532302856, "logps/chosen": -114.57273864746094, "logps/rejected": -57.777809143066406, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 5.787173748016357, "rewards/margins": 1.4895992279052734, "rewards/rejected": 4.297574520111084, "step": 4263 }, { "epoch": 0.94, "learning_rate": 5.69673321579437e-06, "logits/chosen": -1.718013048171997, "logits/rejected": -1.375075340270996, "logps/chosen": -145.8316192626953, "logps/rejected": -45.97519302368164, "loss": 0.1732, "rewards/accuracies": 1.0, "rewards/chosen": 4.058982849121094, "rewards/margins": 2.3989577293395996, "rewards/rejected": 1.6600250005722046, "step": 4264 }, { "epoch": 0.94, "learning_rate": 5.694958329546875e-06, "logits/chosen": -1.9758563041687012, "logits/rejected": -2.011436700820923, "logps/chosen": -69.04327392578125, "logps/rejected": -119.59376525878906, "loss": 0.8505, "rewards/accuracies": 0.0, "rewards/chosen": 6.298150539398193, "rewards/margins": -1.4020509719848633, "rewards/rejected": 7.700201511383057, "step": 4265 }, { "epoch": 0.94, "learning_rate": 5.693183353998906e-06, "logits/chosen": -1.9547220468521118, "logits/rejected": -1.9432892799377441, "logps/chosen": -94.0498046875, "logps/rejected": -61.93690490722656, "loss": 1.3684, "rewards/accuracies": 0.0, "rewards/chosen": 4.089718818664551, "rewards/margins": -1.3364853858947754, "rewards/rejected": 5.426204204559326, "step": 4266 }, { "epoch": 0.94, "learning_rate": 5.691408289378542e-06, "logits/chosen": -1.3775633573532104, "logits/rejected": -1.3775633573532104, "logps/chosen": -90.80361938476562, "logps/rejected": -90.80361938476562, "loss": 0.3473, "rewards/accuracies": 0.0, "rewards/chosen": 2.9206109046936035, "rewards/margins": 0.0, "rewards/rejected": 2.9206109046936035, "step": 4267 }, { "epoch": 0.94, "learning_rate": 5.689633135913874e-06, "logits/chosen": -1.462897539138794, "logits/rejected": -1.4755420684814453, "logps/chosen": -147.2747802734375, "logps/rejected": -83.748046875, "loss": 1.736, "rewards/accuracies": 0.0, "rewards/chosen": 2.6397156715393066, "rewards/margins": -3.275364875793457, "rewards/rejected": 5.915080547332764, "step": 4268 }, { "epoch": 0.94, "learning_rate": 5.687857893833008e-06, "logits/chosen": -1.5117934942245483, "logits/rejected": -1.4670376777648926, "logps/chosen": -80.96455383300781, "logps/rejected": -80.39989471435547, "loss": 0.5273, "rewards/accuracies": 1.0, "rewards/chosen": 3.405895948410034, "rewards/margins": 0.6985511779785156, "rewards/rejected": 2.7073447704315186, "step": 4269 }, { "epoch": 0.95, "learning_rate": 5.686082563364055e-06, "logits/chosen": -1.9255620241165161, "logits/rejected": -1.8800233602523804, "logps/chosen": -66.8683090209961, "logps/rejected": -38.80229187011719, "loss": 0.2926, "rewards/accuracies": 1.0, "rewards/chosen": 3.151024580001831, "rewards/margins": 0.23272323608398438, "rewards/rejected": 2.9183013439178467, "step": 4270 }, { "epoch": 0.95, "learning_rate": 5.684307144735142e-06, "logits/chosen": -1.4396220445632935, "logits/rejected": -1.386656403541565, "logps/chosen": -56.130516052246094, "logps/rejected": -146.3854522705078, "loss": 1.5771, "rewards/accuracies": 0.0, "rewards/chosen": 4.71962308883667, "rewards/margins": -3.0931410789489746, "rewards/rejected": 7.8127641677856445, "step": 4271 }, { "epoch": 0.95, "learning_rate": 5.682531638174407e-06, "logits/chosen": -1.5224251747131348, "logits/rejected": -1.5656890869140625, "logps/chosen": -64.9528579711914, "logps/rejected": -68.70812225341797, "loss": 1.8594, "rewards/accuracies": 0.0, "rewards/chosen": 3.2198891639709473, "rewards/margins": -3.6048154830932617, "rewards/rejected": 6.824704647064209, "step": 4272 }, { "epoch": 0.95, "learning_rate": 5.680756043909996e-06, "logits/chosen": -1.70302414894104, "logits/rejected": -1.70302414894104, "logps/chosen": -60.259517669677734, "logps/rejected": -60.259517669677734, "loss": 0.3545, "rewards/accuracies": 0.0, "rewards/chosen": 5.437723159790039, "rewards/margins": 0.0, "rewards/rejected": 5.437723159790039, "step": 4273 }, { "epoch": 0.95, "learning_rate": 5.67898036217007e-06, "logits/chosen": -1.340987205505371, "logits/rejected": -1.3043352365493774, "logps/chosen": -107.2298355102539, "logps/rejected": -80.73515319824219, "loss": 0.2793, "rewards/accuracies": 1.0, "rewards/chosen": 4.0266947746276855, "rewards/margins": 0.3114511966705322, "rewards/rejected": 3.7152435779571533, "step": 4274 }, { "epoch": 0.95, "learning_rate": 5.677204593182799e-06, "logits/chosen": -1.486650824546814, "logits/rejected": -1.5218456983566284, "logps/chosen": -53.88673782348633, "logps/rejected": -63.8559684753418, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 4.610767841339111, "rewards/margins": 2.1634559631347656, "rewards/rejected": 2.4473118782043457, "step": 4275 }, { "epoch": 0.95, "learning_rate": 5.675428737176367e-06, "logits/chosen": -1.676360845565796, "logits/rejected": -1.7623518705368042, "logps/chosen": -36.4476318359375, "logps/rejected": -82.32118225097656, "loss": 0.6476, "rewards/accuracies": 0.0, "rewards/chosen": 3.945768117904663, "rewards/margins": -0.9667608737945557, "rewards/rejected": 4.912528991699219, "step": 4276 }, { "epoch": 0.95, "learning_rate": 5.673652794378965e-06, "logits/chosen": -1.701810359954834, "logits/rejected": -1.7045460939407349, "logps/chosen": -87.18008422851562, "logps/rejected": -80.22628784179688, "loss": 3.004, "rewards/accuracies": 0.0, "rewards/chosen": 2.531752109527588, "rewards/margins": -5.493744373321533, "rewards/rejected": 8.025496482849121, "step": 4277 }, { "epoch": 0.95, "learning_rate": 5.671876765018799e-06, "logits/chosen": -1.6316521167755127, "logits/rejected": -1.6316521167755127, "logps/chosen": -47.70627212524414, "logps/rejected": -47.70627212524414, "loss": 1.0876, "rewards/accuracies": 0.0, "rewards/chosen": 3.6644344329833984, "rewards/margins": 0.0, "rewards/rejected": 3.6644344329833984, "step": 4278 }, { "epoch": 0.95, "learning_rate": 5.670100649324083e-06, "logits/chosen": -1.4709891080856323, "logits/rejected": -1.4285556077957153, "logps/chosen": -89.11077880859375, "logps/rejected": -82.3508529663086, "loss": 0.9912, "rewards/accuracies": 0.0, "rewards/chosen": 1.887043833732605, "rewards/margins": -0.5934745073318481, "rewards/rejected": 2.480518341064453, "step": 4279 }, { "epoch": 0.95, "learning_rate": 5.668324447523046e-06, "logits/chosen": -1.4542453289031982, "logits/rejected": -1.3928098678588867, "logps/chosen": -63.92057800292969, "logps/rejected": -66.35066986083984, "loss": 0.1909, "rewards/accuracies": 1.0, "rewards/chosen": 3.6343743801116943, "rewards/margins": 1.7790894508361816, "rewards/rejected": 1.8552849292755127, "step": 4280 }, { "epoch": 0.95, "learning_rate": 5.666548159843923e-06, "logits/chosen": -1.335451602935791, "logits/rejected": -1.253839373588562, "logps/chosen": -44.866455078125, "logps/rejected": -49.301048278808594, "loss": 1.903, "rewards/accuracies": 0.0, "rewards/chosen": 3.068194627761841, "rewards/margins": -0.35726308822631836, "rewards/rejected": 3.425457715988159, "step": 4281 }, { "epoch": 0.95, "learning_rate": 5.664771786514963e-06, "logits/chosen": -1.5307765007019043, "logits/rejected": -1.463979721069336, "logps/chosen": -102.6454849243164, "logps/rejected": -96.84588623046875, "loss": 0.3708, "rewards/accuracies": 1.0, "rewards/chosen": 6.051262855529785, "rewards/margins": 2.84328556060791, "rewards/rejected": 3.207977294921875, "step": 4282 }, { "epoch": 0.95, "learning_rate": 5.662995327764429e-06, "logits/chosen": -1.2561872005462646, "logits/rejected": -1.2032736539840698, "logps/chosen": -19.416501998901367, "logps/rejected": -8.492652893066406, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 2.6748483180999756, "rewards/margins": 1.807221531867981, "rewards/rejected": 0.8676267862319946, "step": 4283 }, { "epoch": 0.95, "learning_rate": 5.661218783820587e-06, "logits/chosen": -1.7820589542388916, "logits/rejected": -1.757814884185791, "logps/chosen": -39.11772155761719, "logps/rejected": -87.59042358398438, "loss": 1.8662, "rewards/accuracies": 1.0, "rewards/chosen": 2.8756988048553467, "rewards/margins": 0.4512624740600586, "rewards/rejected": 2.424436330795288, "step": 4284 }, { "epoch": 0.95, "learning_rate": 5.6594421549117215e-06, "logits/chosen": -1.6231054067611694, "logits/rejected": -1.5350849628448486, "logps/chosen": -82.34114837646484, "logps/rejected": -40.70876693725586, "loss": 0.9142, "rewards/accuracies": 1.0, "rewards/chosen": 3.4123101234436035, "rewards/margins": 0.3125736713409424, "rewards/rejected": 3.099736452102661, "step": 4285 }, { "epoch": 0.95, "learning_rate": 5.657665441266124e-06, "logits/chosen": -1.828436017036438, "logits/rejected": -1.718327283859253, "logps/chosen": -128.06024169921875, "logps/rejected": -56.299842834472656, "loss": 0.3166, "rewards/accuracies": 1.0, "rewards/chosen": 5.967816352844238, "rewards/margins": 2.143338918685913, "rewards/rejected": 3.824477434158325, "step": 4286 }, { "epoch": 0.95, "learning_rate": 5.655888643112099e-06, "logits/chosen": -1.5982004404067993, "logits/rejected": -1.5807664394378662, "logps/chosen": -78.60179901123047, "logps/rejected": -57.6357421875, "loss": 0.5305, "rewards/accuracies": 0.0, "rewards/chosen": 1.831311821937561, "rewards/margins": -0.5045434236526489, "rewards/rejected": 2.33585524559021, "step": 4287 }, { "epoch": 0.95, "learning_rate": 5.6541117606779585e-06, "logits/chosen": -1.380780577659607, "logits/rejected": -1.3694580793380737, "logps/chosen": -36.43470764160156, "logps/rejected": -37.95074462890625, "loss": 0.4282, "rewards/accuracies": 1.0, "rewards/chosen": 2.548647403717041, "rewards/margins": 0.9505921602249146, "rewards/rejected": 1.5980552434921265, "step": 4288 }, { "epoch": 0.95, "learning_rate": 5.6523347941920316e-06, "logits/chosen": -1.5247926712036133, "logits/rejected": -1.2753828763961792, "logps/chosen": -32.60863494873047, "logps/rejected": -46.3767204284668, "loss": 0.3834, "rewards/accuracies": 1.0, "rewards/chosen": 2.165851593017578, "rewards/margins": 0.4995594024658203, "rewards/rejected": 1.6662921905517578, "step": 4289 }, { "epoch": 0.95, "learning_rate": 5.65055774388265e-06, "logits/chosen": -1.891993761062622, "logits/rejected": -1.89808189868927, "logps/chosen": -66.07181549072266, "logps/rejected": -42.158409118652344, "loss": 1.4894, "rewards/accuracies": 0.0, "rewards/chosen": 2.97332763671875, "rewards/margins": -2.6383728981018066, "rewards/rejected": 5.611700534820557, "step": 4290 }, { "epoch": 0.95, "learning_rate": 5.648780609978162e-06, "logits/chosen": -1.6850013732910156, "logits/rejected": -1.702515721321106, "logps/chosen": -38.27942657470703, "logps/rejected": -43.555519104003906, "loss": 1.4582, "rewards/accuracies": 0.0, "rewards/chosen": 2.755140781402588, "rewards/margins": -0.32665085792541504, "rewards/rejected": 3.081791639328003, "step": 4291 }, { "epoch": 0.95, "learning_rate": 5.647003392706927e-06, "logits/chosen": -1.1722755432128906, "logits/rejected": -1.1375491619110107, "logps/chosen": -45.92772674560547, "logps/rejected": -32.018638610839844, "loss": 0.7133, "rewards/accuracies": 0.0, "rewards/chosen": 1.9807456731796265, "rewards/margins": -1.0964287519454956, "rewards/rejected": 3.077174425125122, "step": 4292 }, { "epoch": 0.95, "learning_rate": 5.64522609229731e-06, "logits/chosen": -1.6668496131896973, "logits/rejected": -1.565709114074707, "logps/chosen": -98.7681884765625, "logps/rejected": -62.42833709716797, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 7.6695709228515625, "rewards/margins": 5.156158447265625, "rewards/rejected": 2.5134124755859375, "step": 4293 }, { "epoch": 0.95, "learning_rate": 5.643448708977692e-06, "logits/chosen": -1.4101001024246216, "logits/rejected": -1.3899680376052856, "logps/chosen": -35.1062126159668, "logps/rejected": -35.651573181152344, "loss": 0.57, "rewards/accuracies": 0.0, "rewards/chosen": 1.5209015607833862, "rewards/margins": -0.40453100204467773, "rewards/rejected": 1.925432562828064, "step": 4294 }, { "epoch": 0.95, "learning_rate": 5.641671242976461e-06, "logits/chosen": -1.7541459798812866, "logits/rejected": -1.6338045597076416, "logps/chosen": -129.65476989746094, "logps/rejected": -20.7786865234375, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 6.689195156097412, "rewards/margins": 5.347240447998047, "rewards/rejected": 1.3419548273086548, "step": 4295 }, { "epoch": 0.95, "learning_rate": 5.639893694522018e-06, "logits/chosen": -1.3738807439804077, "logits/rejected": -1.3778421878814697, "logps/chosen": -50.0783805847168, "logps/rejected": -103.97581481933594, "loss": 3.085, "rewards/accuracies": 0.0, "rewards/chosen": 2.706937074661255, "rewards/margins": -5.295544624328613, "rewards/rejected": 8.002481460571289, "step": 4296 }, { "epoch": 0.95, "learning_rate": 5.638116063842774e-06, "logits/chosen": -1.8363428115844727, "logits/rejected": -1.6777900457382202, "logps/chosen": -127.44264221191406, "logps/rejected": -73.86871337890625, "loss": 0.1372, "rewards/accuracies": 1.0, "rewards/chosen": 8.376260757446289, "rewards/margins": 1.3779282569885254, "rewards/rejected": 6.998332500457764, "step": 4297 }, { "epoch": 0.95, "learning_rate": 5.63633835116715e-06, "logits/chosen": -1.4684768915176392, "logits/rejected": -1.420366883277893, "logps/chosen": -62.83558654785156, "logps/rejected": -52.180381774902344, "loss": 0.5188, "rewards/accuracies": 0.0, "rewards/chosen": 3.852534532546997, "rewards/margins": -0.08228230476379395, "rewards/rejected": 3.934816837310791, "step": 4298 }, { "epoch": 0.95, "learning_rate": 5.634560556723576e-06, "logits/chosen": -2.000453233718872, "logits/rejected": -2.0064079761505127, "logps/chosen": -68.3872299194336, "logps/rejected": -22.59598731994629, "loss": 0.0796, "rewards/accuracies": 1.0, "rewards/chosen": 6.482089996337891, "rewards/margins": 3.263207197189331, "rewards/rejected": 3.2188827991485596, "step": 4299 }, { "epoch": 0.95, "learning_rate": 5.632782680740497e-06, "logits/chosen": -1.4817899465560913, "logits/rejected": -1.3297231197357178, "logps/chosen": -89.534912109375, "logps/rejected": -100.08052062988281, "loss": 1.0924, "rewards/accuracies": 0.0, "rewards/chosen": 6.687025547027588, "rewards/margins": -2.055746555328369, "rewards/rejected": 8.742772102355957, "step": 4300 }, { "epoch": 0.95, "learning_rate": 5.631004723446365e-06, "logits/chosen": -1.510206937789917, "logits/rejected": -1.469738483428955, "logps/chosen": -80.34774780273438, "logps/rejected": -64.60753631591797, "loss": 0.143, "rewards/accuracies": 1.0, "rewards/chosen": 4.999586582183838, "rewards/margins": 2.955981492996216, "rewards/rejected": 2.043605089187622, "step": 4301 }, { "epoch": 0.95, "learning_rate": 5.629226685069642e-06, "logits/chosen": -1.3235485553741455, "logits/rejected": -1.1941677331924438, "logps/chosen": -50.75518035888672, "logps/rejected": -11.545843124389648, "loss": 0.1609, "rewards/accuracies": 1.0, "rewards/chosen": 4.45361852645874, "rewards/margins": 3.0833168029785156, "rewards/rejected": 1.370301604270935, "step": 4302 }, { "epoch": 0.95, "learning_rate": 5.627448565838804e-06, "logits/chosen": -1.7154555320739746, "logits/rejected": -1.6518700122833252, "logps/chosen": -67.0463638305664, "logps/rejected": -56.33246612548828, "loss": 0.6554, "rewards/accuracies": 0.0, "rewards/chosen": 3.084514617919922, "rewards/margins": -0.9585480690002441, "rewards/rejected": 4.043062686920166, "step": 4303 }, { "epoch": 0.95, "learning_rate": 5.625670365982332e-06, "logits/chosen": -1.4785120487213135, "logits/rejected": -1.4584821462631226, "logps/chosen": -49.89693832397461, "logps/rejected": -40.11481475830078, "loss": 0.4343, "rewards/accuracies": 0.0, "rewards/chosen": 2.3826115131378174, "rewards/margins": -0.26541852951049805, "rewards/rejected": 2.6480300426483154, "step": 4304 }, { "epoch": 0.95, "learning_rate": 5.623892085728722e-06, "logits/chosen": -1.5731054544448853, "logits/rejected": -1.5092047452926636, "logps/chosen": -114.77568817138672, "logps/rejected": -63.446022033691406, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": 4.729978084564209, "rewards/margins": 1.8657186031341553, "rewards/rejected": 2.8642594814300537, "step": 4305 }, { "epoch": 0.95, "learning_rate": 5.6221137253064816e-06, "logits/chosen": -1.8414562940597534, "logits/rejected": -1.8544089794158936, "logps/chosen": -38.2689323425293, "logps/rejected": -113.58918762207031, "loss": 1.1289, "rewards/accuracies": 1.0, "rewards/chosen": 2.682560443878174, "rewards/margins": 0.0605771541595459, "rewards/rejected": 2.621983289718628, "step": 4306 }, { "epoch": 0.95, "learning_rate": 5.620335284944121e-06, "logits/chosen": -1.5592920780181885, "logits/rejected": -1.6179862022399902, "logps/chosen": -134.41993713378906, "logps/rejected": -161.404052734375, "loss": 1.0764, "rewards/accuracies": 0.0, "rewards/chosen": 6.666792392730713, "rewards/margins": -2.005445957183838, "rewards/rejected": 8.67223834991455, "step": 4307 }, { "epoch": 0.95, "learning_rate": 5.6185567648701664e-06, "logits/chosen": -1.5620861053466797, "logits/rejected": -1.5741868019104004, "logps/chosen": -113.46115112304688, "logps/rejected": -108.55963134765625, "loss": 0.6541, "rewards/accuracies": 1.0, "rewards/chosen": 6.2158403396606445, "rewards/margins": 0.6600480079650879, "rewards/rejected": 5.555792331695557, "step": 4308 }, { "epoch": 0.95, "learning_rate": 5.616778165313156e-06, "logits/chosen": -1.68497633934021, "logits/rejected": -1.5109189748764038, "logps/chosen": -102.87422180175781, "logps/rejected": -46.02269744873047, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": 8.058097839355469, "rewards/margins": 3.8441948890686035, "rewards/rejected": 4.213902950286865, "step": 4309 }, { "epoch": 0.95, "learning_rate": 5.614999486501633e-06, "logits/chosen": -1.4362002611160278, "logits/rejected": -1.3692448139190674, "logps/chosen": -173.3916473388672, "logps/rejected": -138.75277709960938, "loss": 0.1638, "rewards/accuracies": 1.0, "rewards/chosen": 7.050572395324707, "rewards/margins": 0.950810432434082, "rewards/rejected": 6.099761962890625, "step": 4310 }, { "epoch": 0.95, "learning_rate": 5.613220728664155e-06, "logits/chosen": -1.2304266691207886, "logits/rejected": -1.1665160655975342, "logps/chosen": -36.06950759887695, "logps/rejected": -40.57243347167969, "loss": 0.454, "rewards/accuracies": 1.0, "rewards/chosen": 2.9843685626983643, "rewards/margins": 0.023362398147583008, "rewards/rejected": 2.9610061645507812, "step": 4311 }, { "epoch": 0.95, "learning_rate": 5.611441892029287e-06, "logits/chosen": -1.482893466949463, "logits/rejected": -1.4490666389465332, "logps/chosen": -97.4047622680664, "logps/rejected": -71.28507995605469, "loss": 0.0796, "rewards/accuracies": 1.0, "rewards/chosen": 5.932693004608154, "rewards/margins": 3.0485613346099854, "rewards/rejected": 2.884131669998169, "step": 4312 }, { "epoch": 0.95, "learning_rate": 5.6096629768256075e-06, "logits/chosen": -1.5874558687210083, "logits/rejected": -1.5203709602355957, "logps/chosen": -71.5245361328125, "logps/rejected": -58.12388610839844, "loss": 0.8514, "rewards/accuracies": 0.0, "rewards/chosen": 1.0659561157226562, "rewards/margins": -1.5014550685882568, "rewards/rejected": 2.567411184310913, "step": 4313 }, { "epoch": 0.95, "learning_rate": 5.6078839832817004e-06, "logits/chosen": -1.6384040117263794, "logits/rejected": -1.5028399229049683, "logps/chosen": -52.64386749267578, "logps/rejected": -79.08934020996094, "loss": 2.3556, "rewards/accuracies": 0.0, "rewards/chosen": 2.118994951248169, "rewards/margins": -3.623270273208618, "rewards/rejected": 5.742265224456787, "step": 4314 }, { "epoch": 0.96, "learning_rate": 5.6061049116261625e-06, "logits/chosen": -1.742017149925232, "logits/rejected": -1.7339073419570923, "logps/chosen": -53.09337615966797, "logps/rejected": -73.71920776367188, "loss": 0.416, "rewards/accuracies": 0.0, "rewards/chosen": 2.9642059803009033, "rewards/margins": -0.07813262939453125, "rewards/rejected": 3.0423386096954346, "step": 4315 }, { "epoch": 0.96, "learning_rate": 5.6043257620876e-06, "logits/chosen": -1.4612959623336792, "logits/rejected": -1.3532309532165527, "logps/chosen": -58.16396713256836, "logps/rejected": -13.972434043884277, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 7.618461608886719, "rewards/margins": 5.57275390625, "rewards/rejected": 2.0457077026367188, "step": 4316 }, { "epoch": 0.96, "learning_rate": 5.602546534894629e-06, "logits/chosen": -1.5215760469436646, "logits/rejected": -1.5097157955169678, "logps/chosen": -37.53216552734375, "logps/rejected": -72.98654174804688, "loss": 0.7631, "rewards/accuracies": 0.0, "rewards/chosen": 2.7142090797424316, "rewards/margins": -1.2544875144958496, "rewards/rejected": 3.9686965942382812, "step": 4317 }, { "epoch": 0.96, "learning_rate": 5.600767230275878e-06, "logits/chosen": -1.6386915445327759, "logits/rejected": -1.6386915445327759, "logps/chosen": -41.79985046386719, "logps/rejected": -41.79985046386719, "loss": 0.438, "rewards/accuracies": 0.0, "rewards/chosen": 2.3348801136016846, "rewards/margins": 0.0, "rewards/rejected": 2.3348801136016846, "step": 4318 }, { "epoch": 0.96, "learning_rate": 5.598987848459982e-06, "logits/chosen": -1.820172667503357, "logits/rejected": -1.8031044006347656, "logps/chosen": -88.51942443847656, "logps/rejected": -81.78511047363281, "loss": 0.5043, "rewards/accuracies": 0.0, "rewards/chosen": 6.191511631011963, "rewards/margins": -0.4799180030822754, "rewards/rejected": 6.671429634094238, "step": 4319 }, { "epoch": 0.96, "learning_rate": 5.597208389675587e-06, "logits/chosen": -1.7284318208694458, "logits/rejected": -1.5679270029067993, "logps/chosen": -153.16583251953125, "logps/rejected": -145.35531616210938, "loss": 0.153, "rewards/accuracies": 1.0, "rewards/chosen": 7.961090087890625, "rewards/margins": 1.0998501777648926, "rewards/rejected": 6.861239910125732, "step": 4320 }, { "epoch": 0.96, "learning_rate": 5.59542885415135e-06, "logits/chosen": -1.5039489269256592, "logits/rejected": -1.5753552913665771, "logps/chosen": -90.89291381835938, "logps/rejected": -75.39425659179688, "loss": 3.2488, "rewards/accuracies": 0.0, "rewards/chosen": 2.7028794288635254, "rewards/margins": -6.4927449226379395, "rewards/rejected": 9.195624351501465, "step": 4321 }, { "epoch": 0.96, "learning_rate": 5.5936492421159374e-06, "logits/chosen": -1.7072781324386597, "logits/rejected": -1.6369662284851074, "logps/chosen": -152.55250549316406, "logps/rejected": -165.00961303710938, "loss": 1.0657, "rewards/accuracies": 1.0, "rewards/chosen": 6.886605739593506, "rewards/margins": 1.040295124053955, "rewards/rejected": 5.846310615539551, "step": 4322 }, { "epoch": 0.96, "learning_rate": 5.5918695537980225e-06, "logits/chosen": -1.965535044670105, "logits/rejected": -1.898121953010559, "logps/chosen": -47.850948333740234, "logps/rejected": -46.93498229980469, "loss": 1.0777, "rewards/accuracies": 0.0, "rewards/chosen": 2.294240951538086, "rewards/margins": -2.0150623321533203, "rewards/rejected": 4.309303283691406, "step": 4323 }, { "epoch": 0.96, "learning_rate": 5.590089789426295e-06, "logits/chosen": -1.559899091720581, "logits/rejected": -1.5354558229446411, "logps/chosen": -21.175344467163086, "logps/rejected": -34.431339263916016, "loss": 1.1247, "rewards/accuracies": 0.0, "rewards/chosen": 1.452953577041626, "rewards/margins": -1.601384162902832, "rewards/rejected": 3.054337739944458, "step": 4324 }, { "epoch": 0.96, "learning_rate": 5.588309949229447e-06, "logits/chosen": -1.7527639865875244, "logits/rejected": -1.7570793628692627, "logps/chosen": -43.76459503173828, "logps/rejected": -105.63653564453125, "loss": 1.343, "rewards/accuracies": 0.0, "rewards/chosen": 3.167966604232788, "rewards/margins": -2.4810783863067627, "rewards/rejected": 5.649044990539551, "step": 4325 }, { "epoch": 0.96, "learning_rate": 5.586530033436184e-06, "logits/chosen": -1.5970925092697144, "logits/rejected": -1.4977885484695435, "logps/chosen": -154.97869873046875, "logps/rejected": -41.61357879638672, "loss": 0.8606, "rewards/accuracies": 0.0, "rewards/chosen": 4.25880765914917, "rewards/margins": -1.3848257064819336, "rewards/rejected": 5.6436333656311035, "step": 4326 }, { "epoch": 0.96, "learning_rate": 5.584750042275222e-06, "logits/chosen": -1.42423677444458, "logits/rejected": -1.4653950929641724, "logps/chosen": -48.95261001586914, "logps/rejected": -51.44613265991211, "loss": 1.0543, "rewards/accuracies": 0.0, "rewards/chosen": 2.2231030464172363, "rewards/margins": -1.7653677463531494, "rewards/rejected": 3.9884707927703857, "step": 4327 }, { "epoch": 0.96, "learning_rate": 5.582969975975285e-06, "logits/chosen": -1.9370182752609253, "logits/rejected": -1.8792368173599243, "logps/chosen": -161.5343017578125, "logps/rejected": -114.30831909179688, "loss": 1.1494, "rewards/accuracies": 0.0, "rewards/chosen": 6.7947235107421875, "rewards/margins": -2.1910505294799805, "rewards/rejected": 8.985774040222168, "step": 4328 }, { "epoch": 0.96, "learning_rate": 5.581189834765107e-06, "logits/chosen": -1.7761211395263672, "logits/rejected": -1.7720260620117188, "logps/chosen": -58.98405456542969, "logps/rejected": -60.83140563964844, "loss": 1.4511, "rewards/accuracies": 0.0, "rewards/chosen": 3.338090658187866, "rewards/margins": -1.7516400814056396, "rewards/rejected": 5.089730739593506, "step": 4329 }, { "epoch": 0.96, "learning_rate": 5.5794096188734335e-06, "logits/chosen": -1.6841747760772705, "logits/rejected": -1.4704103469848633, "logps/chosen": -117.4083023071289, "logps/rejected": -53.55259704589844, "loss": 0.2193, "rewards/accuracies": 1.0, "rewards/chosen": 7.4841485023498535, "rewards/margins": 5.221197128295898, "rewards/rejected": 2.262951612472534, "step": 4330 }, { "epoch": 0.96, "learning_rate": 5.577629328529017e-06, "logits/chosen": -1.7491978406906128, "logits/rejected": -1.7081440687179565, "logps/chosen": -102.14273071289062, "logps/rejected": -56.40504455566406, "loss": 0.8697, "rewards/accuracies": 0.0, "rewards/chosen": 3.302677869796753, "rewards/margins": -1.544987440109253, "rewards/rejected": 4.847665309906006, "step": 4331 }, { "epoch": 0.96, "learning_rate": 5.575848963960621e-06, "logits/chosen": -1.6923109292984009, "logits/rejected": -1.6646374464035034, "logps/chosen": -40.21116638183594, "logps/rejected": -34.04145050048828, "loss": 1.4867, "rewards/accuracies": 1.0, "rewards/chosen": 2.725054979324341, "rewards/margins": 0.9215401411056519, "rewards/rejected": 1.803514838218689, "step": 4332 }, { "epoch": 0.96, "learning_rate": 5.574068525397018e-06, "logits/chosen": -1.8040874004364014, "logits/rejected": -1.783510446548462, "logps/chosen": -142.03176879882812, "logps/rejected": -114.91000366210938, "loss": 2.0816, "rewards/accuracies": 0.0, "rewards/chosen": 6.5410614013671875, "rewards/margins": -4.140625, "rewards/rejected": 10.681686401367188, "step": 4333 }, { "epoch": 0.96, "learning_rate": 5.572288013066986e-06, "logits/chosen": -1.6727818250656128, "logits/rejected": -1.6916568279266357, "logps/chosen": -55.45652770996094, "logps/rejected": -59.474334716796875, "loss": 0.5333, "rewards/accuracies": 1.0, "rewards/chosen": 3.1082725524902344, "rewards/margins": 0.12421727180480957, "rewards/rejected": 2.984055280685425, "step": 4334 }, { "epoch": 0.96, "learning_rate": 5.570507427199323e-06, "logits/chosen": -1.6485648155212402, "logits/rejected": -1.6402547359466553, "logps/chosen": -57.19480514526367, "logps/rejected": -63.41582107543945, "loss": 0.7931, "rewards/accuracies": 0.0, "rewards/chosen": 2.4329967498779297, "rewards/margins": -1.3409507274627686, "rewards/rejected": 3.7739474773406982, "step": 4335 }, { "epoch": 0.96, "learning_rate": 5.568726768022825e-06, "logits/chosen": -2.044081687927246, "logits/rejected": -2.0219810009002686, "logps/chosen": -51.83013153076172, "logps/rejected": -68.85613250732422, "loss": 0.7356, "rewards/accuracies": 1.0, "rewards/chosen": 3.158329725265503, "rewards/margins": 0.3979461193084717, "rewards/rejected": 2.7603836059570312, "step": 4336 }, { "epoch": 0.96, "learning_rate": 5.566946035766307e-06, "logits/chosen": -1.8225253820419312, "logits/rejected": -1.7064704895019531, "logps/chosen": -102.39518737792969, "logps/rejected": -31.609270095825195, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 8.13658618927002, "rewards/margins": 5.671090126037598, "rewards/rejected": 2.465496301651001, "step": 4337 }, { "epoch": 0.96, "learning_rate": 5.565165230658583e-06, "logits/chosen": -1.5809555053710938, "logits/rejected": -1.5447266101837158, "logps/chosen": -60.174522399902344, "logps/rejected": -36.685516357421875, "loss": 1.1184, "rewards/accuracies": 0.0, "rewards/chosen": 2.8003578186035156, "rewards/margins": -0.7077934741973877, "rewards/rejected": 3.5081512928009033, "step": 4338 }, { "epoch": 0.96, "learning_rate": 5.563384352928488e-06, "logits/chosen": -1.4522440433502197, "logits/rejected": -1.3402485847473145, "logps/chosen": -41.4510612487793, "logps/rejected": -51.647178649902344, "loss": 0.3591, "rewards/accuracies": 0.0, "rewards/chosen": 2.610668659210205, "rewards/margins": -0.03786110877990723, "rewards/rejected": 2.6485297679901123, "step": 4339 }, { "epoch": 0.96, "learning_rate": 5.561603402804858e-06, "logits/chosen": -1.5952281951904297, "logits/rejected": -1.5431218147277832, "logps/chosen": -64.20468139648438, "logps/rejected": -68.84400939941406, "loss": 0.7304, "rewards/accuracies": 1.0, "rewards/chosen": 2.205226182937622, "rewards/margins": 0.07973480224609375, "rewards/rejected": 2.1254913806915283, "step": 4340 }, { "epoch": 0.96, "learning_rate": 5.559822380516539e-06, "logits/chosen": -1.7710604667663574, "logits/rejected": -1.8168885707855225, "logps/chosen": -237.7344970703125, "logps/rejected": -181.24034118652344, "loss": 0.5191, "rewards/accuracies": 0.0, "rewards/chosen": 9.893365859985352, "rewards/margins": -0.597559928894043, "rewards/rejected": 10.490925788879395, "step": 4341 }, { "epoch": 0.96, "learning_rate": 5.5580412862923916e-06, "logits/chosen": -1.6015031337738037, "logits/rejected": -1.5360281467437744, "logps/chosen": -31.261343002319336, "logps/rejected": -9.163603782653809, "loss": 0.133, "rewards/accuracies": 1.0, "rewards/chosen": 2.7954423427581787, "rewards/margins": 1.9737420082092285, "rewards/rejected": 0.821700394153595, "step": 4342 }, { "epoch": 0.96, "learning_rate": 5.55626012036128e-06, "logits/chosen": -1.5845637321472168, "logits/rejected": -1.5473418235778809, "logps/chosen": -64.64556121826172, "logps/rejected": -54.81694030761719, "loss": 1.1531, "rewards/accuracies": 0.0, "rewards/chosen": 2.450223684310913, "rewards/margins": -1.3906888961791992, "rewards/rejected": 3.8409125804901123, "step": 4343 }, { "epoch": 0.96, "learning_rate": 5.554478882952081e-06, "logits/chosen": -1.6542752981185913, "logits/rejected": -1.6743988990783691, "logps/chosen": -63.951786041259766, "logps/rejected": -56.138484954833984, "loss": 1.0703, "rewards/accuracies": 0.0, "rewards/chosen": 2.3093533515930176, "rewards/margins": -2.015374183654785, "rewards/rejected": 4.324727535247803, "step": 4344 }, { "epoch": 0.96, "learning_rate": 5.5526975742936774e-06, "logits/chosen": -1.9230989217758179, "logits/rejected": -1.9104578495025635, "logps/chosen": -72.25039672851562, "logps/rejected": -93.50775909423828, "loss": 2.0296, "rewards/accuracies": 0.0, "rewards/chosen": 2.418339490890503, "rewards/margins": -3.9989449977874756, "rewards/rejected": 6.4172844886779785, "step": 4345 }, { "epoch": 0.96, "learning_rate": 5.5509161946149635e-06, "logits/chosen": -1.6020634174346924, "logits/rejected": -1.6664034128189087, "logps/chosen": -71.52499389648438, "logps/rejected": -144.4613037109375, "loss": 0.1509, "rewards/accuracies": 1.0, "rewards/chosen": 6.146636962890625, "rewards/margins": 1.0809369087219238, "rewards/rejected": 5.065700054168701, "step": 4346 }, { "epoch": 0.96, "learning_rate": 5.549134744144844e-06, "logits/chosen": -1.6199475526809692, "logits/rejected": -1.4309203624725342, "logps/chosen": -177.11187744140625, "logps/rejected": -13.388458251953125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 7.843618869781494, "rewards/margins": 6.804028511047363, "rewards/rejected": 1.0395904779434204, "step": 4347 }, { "epoch": 0.96, "learning_rate": 5.547353223112231e-06, "logits/chosen": -1.7234156131744385, "logits/rejected": -1.611135482788086, "logps/chosen": -151.14971923828125, "logps/rejected": -65.5414810180664, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 7.173478603363037, "rewards/margins": 4.833727836608887, "rewards/rejected": 2.3397507667541504, "step": 4348 }, { "epoch": 0.96, "learning_rate": 5.545571631746043e-06, "logits/chosen": -1.7050652503967285, "logits/rejected": -1.7050652503967285, "logps/chosen": -89.31945037841797, "logps/rejected": -89.31945037841797, "loss": 0.8416, "rewards/accuracies": 0.0, "rewards/chosen": 7.37701940536499, "rewards/margins": 0.0, "rewards/rejected": 7.37701940536499, "step": 4349 }, { "epoch": 0.96, "learning_rate": 5.54378997027521e-06, "logits/chosen": -1.4325075149536133, "logits/rejected": -1.4060726165771484, "logps/chosen": -114.32239532470703, "logps/rejected": -58.92121124267578, "loss": 0.5764, "rewards/accuracies": 1.0, "rewards/chosen": 5.604828834533691, "rewards/margins": 1.635986566543579, "rewards/rejected": 3.9688422679901123, "step": 4350 }, { "epoch": 0.96, "learning_rate": 5.542008238928676e-06, "logits/chosen": -1.66858971118927, "logits/rejected": -1.5945113897323608, "logps/chosen": -64.20436096191406, "logps/rejected": -133.45626831054688, "loss": 1.7965, "rewards/accuracies": 0.0, "rewards/chosen": 4.447337627410889, "rewards/margins": -3.5360517501831055, "rewards/rejected": 7.983389377593994, "step": 4351 }, { "epoch": 0.96, "learning_rate": 5.540226437935385e-06, "logits/chosen": -1.7014015913009644, "logits/rejected": -1.6801104545593262, "logps/chosen": -38.98095703125, "logps/rejected": -53.92830276489258, "loss": 0.2671, "rewards/accuracies": 1.0, "rewards/chosen": 3.739398241043091, "rewards/margins": 0.9168224334716797, "rewards/rejected": 2.822575807571411, "step": 4352 }, { "epoch": 0.96, "learning_rate": 5.538444567524296e-06, "logits/chosen": -1.832017421722412, "logits/rejected": -1.9055125713348389, "logps/chosen": -50.84355163574219, "logps/rejected": -108.05844116210938, "loss": 2.674, "rewards/accuracies": 0.0, "rewards/chosen": 3.047119140625, "rewards/margins": -5.232553482055664, "rewards/rejected": 8.279672622680664, "step": 4353 }, { "epoch": 0.96, "learning_rate": 5.5366626279243754e-06, "logits/chosen": -1.448838710784912, "logits/rejected": -1.3232964277267456, "logps/chosen": -45.35999298095703, "logps/rejected": -13.898276329040527, "loss": 0.2541, "rewards/accuracies": 1.0, "rewards/chosen": 3.272982120513916, "rewards/margins": 2.499101161956787, "rewards/rejected": 0.7738808989524841, "step": 4354 }, { "epoch": 0.96, "learning_rate": 5.534880619364595e-06, "logits/chosen": -1.7823160886764526, "logits/rejected": -1.7710262537002563, "logps/chosen": -72.28221130371094, "logps/rejected": -32.2525520324707, "loss": 0.5805, "rewards/accuracies": 0.0, "rewards/chosen": 2.157335042953491, "rewards/margins": -0.7835617065429688, "rewards/rejected": 2.94089674949646, "step": 4355 }, { "epoch": 0.96, "learning_rate": 5.533098542073942e-06, "logits/chosen": -1.4990311861038208, "logits/rejected": -1.4161837100982666, "logps/chosen": -147.23023986816406, "logps/rejected": -39.85430145263672, "loss": 0.203, "rewards/accuracies": 1.0, "rewards/chosen": 5.6786651611328125, "rewards/margins": 2.631305694580078, "rewards/rejected": 3.0473594665527344, "step": 4356 }, { "epoch": 0.96, "learning_rate": 5.5313163962814085e-06, "logits/chosen": -1.623095154762268, "logits/rejected": -1.5689903497695923, "logps/chosen": -51.505882263183594, "logps/rejected": -41.74958038330078, "loss": 0.5566, "rewards/accuracies": 0.0, "rewards/chosen": 3.5509660243988037, "rewards/margins": -0.7113516330718994, "rewards/rejected": 4.262317657470703, "step": 4357 }, { "epoch": 0.96, "learning_rate": 5.529534182215995e-06, "logits/chosen": -2.0318868160247803, "logits/rejected": -1.9218544960021973, "logps/chosen": -169.961181640625, "logps/rejected": -23.363758087158203, "loss": 0.3541, "rewards/accuracies": 1.0, "rewards/chosen": 1.2398407459259033, "rewards/margins": 0.0903770923614502, "rewards/rejected": 1.1494636535644531, "step": 4358 }, { "epoch": 0.96, "learning_rate": 5.527751900106711e-06, "logits/chosen": -1.3786158561706543, "logits/rejected": -1.2881226539611816, "logps/chosen": -100.5750503540039, "logps/rejected": -81.46385192871094, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 6.157247066497803, "rewards/margins": 2.55873703956604, "rewards/rejected": 3.5985100269317627, "step": 4359 }, { "epoch": 0.97, "learning_rate": 5.525969550182577e-06, "logits/chosen": -1.8689172267913818, "logits/rejected": -1.6794521808624268, "logps/chosen": -118.65290069580078, "logps/rejected": -32.23021697998047, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 9.737516403198242, "rewards/margins": 6.301470756530762, "rewards/rejected": 3.4360458850860596, "step": 4360 }, { "epoch": 0.97, "learning_rate": 5.52418713267262e-06, "logits/chosen": -1.6809213161468506, "logits/rejected": -1.683246374130249, "logps/chosen": -52.74074172973633, "logps/rejected": -114.4659194946289, "loss": 1.7768, "rewards/accuracies": 0.0, "rewards/chosen": 3.318882465362549, "rewards/margins": -3.037046432495117, "rewards/rejected": 6.355928897857666, "step": 4361 }, { "epoch": 0.97, "learning_rate": 5.522404647805877e-06, "logits/chosen": -1.673061728477478, "logits/rejected": -1.739841341972351, "logps/chosen": -77.87823486328125, "logps/rejected": -97.7506332397461, "loss": 1.4343, "rewards/accuracies": 0.0, "rewards/chosen": 5.401123046875, "rewards/margins": -2.78994083404541, "rewards/rejected": 8.19106388092041, "step": 4362 }, { "epoch": 0.97, "learning_rate": 5.520622095811392e-06, "logits/chosen": -1.9591190814971924, "logits/rejected": -1.9426530599594116, "logps/chosen": -85.90380096435547, "logps/rejected": -36.26892852783203, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 3.628685712814331, "rewards/margins": 2.4016876220703125, "rewards/rejected": 1.226998209953308, "step": 4363 }, { "epoch": 0.97, "learning_rate": 5.518839476918221e-06, "logits/chosen": -2.1180317401885986, "logits/rejected": -2.109386444091797, "logps/chosen": -127.99783325195312, "logps/rejected": -119.77608489990234, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 8.1488037109375, "rewards/margins": -1.0620641708374023, "rewards/rejected": 9.210867881774902, "step": 4364 }, { "epoch": 0.97, "learning_rate": 5.517056791355423e-06, "logits/chosen": -1.5512181520462036, "logits/rejected": -1.4530103206634521, "logps/chosen": -103.19923400878906, "logps/rejected": -50.54771423339844, "loss": 0.1547, "rewards/accuracies": 1.0, "rewards/chosen": 5.742964267730713, "rewards/margins": 3.798673152923584, "rewards/rejected": 1.9442909955978394, "step": 4365 }, { "epoch": 0.97, "learning_rate": 5.515274039352072e-06, "logits/chosen": -1.6031436920166016, "logits/rejected": -1.591531753540039, "logps/chosen": -71.74554443359375, "logps/rejected": -67.89228820800781, "loss": 0.3676, "rewards/accuracies": 1.0, "rewards/chosen": 5.822396755218506, "rewards/margins": 3.1184003353118896, "rewards/rejected": 2.703996419906616, "step": 4366 }, { "epoch": 0.97, "learning_rate": 5.513491221137244e-06, "logits/chosen": -1.7161239385604858, "logits/rejected": -1.7594317197799683, "logps/chosen": -60.416656494140625, "logps/rejected": -91.22238159179688, "loss": 1.3794, "rewards/accuracies": 0.0, "rewards/chosen": 2.8968658447265625, "rewards/margins": -2.5447630882263184, "rewards/rejected": 5.441628932952881, "step": 4367 }, { "epoch": 0.97, "learning_rate": 5.5117083369400285e-06, "logits/chosen": -1.6214696168899536, "logits/rejected": -1.575587511062622, "logps/chosen": -23.58197021484375, "logps/rejected": -3.7714433670043945, "loss": 0.3077, "rewards/accuracies": 1.0, "rewards/chosen": 1.4348499774932861, "rewards/margins": 0.760318398475647, "rewards/rejected": 0.6745315790176392, "step": 4368 }, { "epoch": 0.97, "learning_rate": 5.509925386989521e-06, "logits/chosen": -1.8516640663146973, "logits/rejected": -1.8669283390045166, "logps/chosen": -31.03614616394043, "logps/rejected": -36.64331817626953, "loss": 1.7963, "rewards/accuracies": 0.0, "rewards/chosen": 1.3921831846237183, "rewards/margins": -3.5023112297058105, "rewards/rejected": 4.894494533538818, "step": 4369 }, { "epoch": 0.97, "learning_rate": 5.508142371514828e-06, "logits/chosen": -1.4836788177490234, "logits/rejected": -1.4836788177490234, "logps/chosen": -42.10923385620117, "logps/rejected": -42.10923385620117, "loss": 0.5112, "rewards/accuracies": 0.0, "rewards/chosen": 2.878412961959839, "rewards/margins": 0.0, "rewards/rejected": 2.878412961959839, "step": 4370 }, { "epoch": 0.97, "learning_rate": 5.506359290745061e-06, "logits/chosen": -1.5887218713760376, "logits/rejected": -1.6070239543914795, "logps/chosen": -76.24546813964844, "logps/rejected": -65.66451263427734, "loss": 2.3905, "rewards/accuracies": 0.0, "rewards/chosen": 1.747766137123108, "rewards/margins": -2.8952088356018066, "rewards/rejected": 4.642974853515625, "step": 4371 }, { "epoch": 0.97, "learning_rate": 5.504576144909344e-06, "logits/chosen": -1.6021682024002075, "logits/rejected": -1.5466927289962769, "logps/chosen": -36.61882019042969, "logps/rejected": -13.46969223022461, "loss": 1.8563, "rewards/accuracies": 1.0, "rewards/chosen": 2.737083911895752, "rewards/margins": 1.6977074146270752, "rewards/rejected": 1.0393764972686768, "step": 4372 }, { "epoch": 0.97, "learning_rate": 5.502792934236805e-06, "logits/chosen": -1.6617991924285889, "logits/rejected": -1.5856237411499023, "logps/chosen": -93.2987289428711, "logps/rejected": -16.257497787475586, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 6.0437750816345215, "rewards/margins": 2.989527940750122, "rewards/rejected": 3.0542471408843994, "step": 4373 }, { "epoch": 0.97, "learning_rate": 5.501009658956583e-06, "logits/chosen": -1.9402453899383545, "logits/rejected": -1.933812141418457, "logps/chosen": -49.32327651977539, "logps/rejected": -52.7122802734375, "loss": 3.728, "rewards/accuracies": 1.0, "rewards/chosen": 2.484748601913452, "rewards/margins": 1.435559630393982, "rewards/rejected": 1.0491889715194702, "step": 4374 }, { "epoch": 0.97, "learning_rate": 5.499226319297824e-06, "logits/chosen": -1.8882501125335693, "logits/rejected": -1.504470705986023, "logps/chosen": -101.03700256347656, "logps/rejected": -154.1918487548828, "loss": 2.2704, "rewards/accuracies": 0.0, "rewards/chosen": 2.053647756576538, "rewards/margins": -3.48237681388855, "rewards/rejected": 5.536024570465088, "step": 4375 }, { "epoch": 0.97, "learning_rate": 5.497442915489682e-06, "logits/chosen": -1.7126598358154297, "logits/rejected": -1.69575035572052, "logps/chosen": -76.90003967285156, "logps/rejected": -48.848472595214844, "loss": 0.5872, "rewards/accuracies": 0.0, "rewards/chosen": 2.2713356018066406, "rewards/margins": -0.7861084938049316, "rewards/rejected": 3.0574440956115723, "step": 4376 }, { "epoch": 0.97, "learning_rate": 5.495659447761321e-06, "logits/chosen": -1.539404034614563, "logits/rejected": -1.4425849914550781, "logps/chosen": -47.5732536315918, "logps/rejected": -27.79573631286621, "loss": 1.0337, "rewards/accuracies": 1.0, "rewards/chosen": 2.2364842891693115, "rewards/margins": 1.9755017757415771, "rewards/rejected": 0.2609825134277344, "step": 4377 }, { "epoch": 0.97, "learning_rate": 5.493875916341913e-06, "logits/chosen": -1.9687840938568115, "logits/rejected": -1.4873145818710327, "logps/chosen": -110.08998107910156, "logps/rejected": -99.87334442138672, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 7.491878032684326, "rewards/margins": 3.0160350799560547, "rewards/rejected": 4.4758429527282715, "step": 4378 }, { "epoch": 0.97, "learning_rate": 5.492092321460637e-06, "logits/chosen": -1.1999735832214355, "logits/rejected": -1.1999735832214355, "logps/chosen": -42.958740234375, "logps/rejected": -42.958740234375, "loss": 0.3934, "rewards/accuracies": 0.0, "rewards/chosen": 2.366326093673706, "rewards/margins": 0.0, "rewards/rejected": 2.366326093673706, "step": 4379 }, { "epoch": 0.97, "learning_rate": 5.490308663346681e-06, "logits/chosen": -1.6365327835083008, "logits/rejected": -1.5991318225860596, "logps/chosen": -62.304100036621094, "logps/rejected": -28.477985382080078, "loss": 1.9272, "rewards/accuracies": 1.0, "rewards/chosen": 2.1886603832244873, "rewards/margins": 1.6465494632720947, "rewards/rejected": 0.5421108603477478, "step": 4380 }, { "epoch": 0.97, "learning_rate": 5.488524942229241e-06, "logits/chosen": -1.6860648393630981, "logits/rejected": -1.6046981811523438, "logps/chosen": -68.13186645507812, "logps/rejected": -51.129974365234375, "loss": 0.1924, "rewards/accuracies": 1.0, "rewards/chosen": 3.9982049465179443, "rewards/margins": 1.5983331203460693, "rewards/rejected": 2.399871826171875, "step": 4381 }, { "epoch": 0.97, "learning_rate": 5.48674115833752e-06, "logits/chosen": -1.509800910949707, "logits/rejected": -1.436567783355713, "logps/chosen": -50.30571746826172, "logps/rejected": -46.31690216064453, "loss": 1.6777, "rewards/accuracies": 0.0, "rewards/chosen": 2.7918593883514404, "rewards/margins": -1.9177577495574951, "rewards/rejected": 4.7096171379089355, "step": 4382 }, { "epoch": 0.97, "learning_rate": 5.484957311900728e-06, "logits/chosen": -1.874718427658081, "logits/rejected": -1.8596477508544922, "logps/chosen": -86.1983871459961, "logps/rejected": -49.87327575683594, "loss": 0.495, "rewards/accuracies": 0.0, "rewards/chosen": 3.1153481006622314, "rewards/margins": -0.28166818618774414, "rewards/rejected": 3.3970162868499756, "step": 4383 }, { "epoch": 0.97, "learning_rate": 5.483173403148091e-06, "logits/chosen": -1.7683939933776855, "logits/rejected": -1.7315986156463623, "logps/chosen": -58.65298843383789, "logps/rejected": -71.29547119140625, "loss": 1.3159, "rewards/accuracies": 0.0, "rewards/chosen": 3.4766597747802734, "rewards/margins": -2.1064939498901367, "rewards/rejected": 5.58315372467041, "step": 4384 }, { "epoch": 0.97, "learning_rate": 5.48138943230883e-06, "logits/chosen": -1.5038886070251465, "logits/rejected": -1.501518726348877, "logps/chosen": -46.32723617553711, "logps/rejected": -67.78541564941406, "loss": 0.1677, "rewards/accuracies": 1.0, "rewards/chosen": 3.4728152751922607, "rewards/margins": 0.9219813346862793, "rewards/rejected": 2.5508339405059814, "step": 4385 }, { "epoch": 0.97, "learning_rate": 5.479605399612185e-06, "logits/chosen": -1.516205906867981, "logits/rejected": -1.5845317840576172, "logps/chosen": -132.96499633789062, "logps/rejected": -109.59341430664062, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": 6.265257358551025, "rewards/margins": 0.5903215408325195, "rewards/rejected": 5.674935817718506, "step": 4386 }, { "epoch": 0.97, "learning_rate": 5.477821305287401e-06, "logits/chosen": -1.7359588146209717, "logits/rejected": -1.7199385166168213, "logps/chosen": -104.4842529296875, "logps/rejected": -75.03486633300781, "loss": 0.4048, "rewards/accuracies": 0.0, "rewards/chosen": 4.799353122711182, "rewards/margins": -0.13971233367919922, "rewards/rejected": 4.939065456390381, "step": 4387 }, { "epoch": 0.97, "learning_rate": 5.4760371495637256e-06, "logits/chosen": -1.6055134534835815, "logits/rejected": -1.481235384941101, "logps/chosen": -64.91722106933594, "logps/rejected": -14.933747291564941, "loss": 0.4966, "rewards/accuracies": 1.0, "rewards/chosen": 3.942333936691284, "rewards/margins": 3.6793980598449707, "rewards/rejected": 0.26293593645095825, "step": 4388 }, { "epoch": 0.97, "learning_rate": 5.474252932670421e-06, "logits/chosen": -1.6300766468048096, "logits/rejected": -1.6552226543426514, "logps/chosen": -67.40277099609375, "logps/rejected": -55.579811096191406, "loss": 0.4851, "rewards/accuracies": 0.0, "rewards/chosen": 2.629956007003784, "rewards/margins": -0.4659898281097412, "rewards/rejected": 3.0959458351135254, "step": 4389 }, { "epoch": 0.97, "learning_rate": 5.4724686548367564e-06, "logits/chosen": -1.6701455116271973, "logits/rejected": -1.5874474048614502, "logps/chosen": -96.81588745117188, "logps/rejected": -70.7012939453125, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 6.765618801116943, "rewards/margins": 3.745615243911743, "rewards/rejected": 3.0200035572052, "step": 4390 }, { "epoch": 0.97, "learning_rate": 5.470684316292004e-06, "logits/chosen": -2.16518497467041, "logits/rejected": -2.135962724685669, "logps/chosen": -40.73710632324219, "logps/rejected": -19.374347686767578, "loss": 1.1038, "rewards/accuracies": 1.0, "rewards/chosen": 2.4348862171173096, "rewards/margins": 1.6008687019348145, "rewards/rejected": 0.8340175747871399, "step": 4391 }, { "epoch": 0.97, "learning_rate": 5.468899917265449e-06, "logits/chosen": -1.644189715385437, "logits/rejected": -1.627461314201355, "logps/chosen": -44.352455139160156, "logps/rejected": -56.866172790527344, "loss": 0.6462, "rewards/accuracies": 0.0, "rewards/chosen": 2.724740743637085, "rewards/margins": -0.9442038536071777, "rewards/rejected": 3.6689445972442627, "step": 4392 }, { "epoch": 0.97, "learning_rate": 5.46711545798638e-06, "logits/chosen": -1.5200861692428589, "logits/rejected": -1.4940773248672485, "logps/chosen": -56.582733154296875, "logps/rejected": -46.79045104980469, "loss": 1.4735, "rewards/accuracies": 0.0, "rewards/chosen": 3.0937423706054688, "rewards/margins": -2.8594865798950195, "rewards/rejected": 5.953228950500488, "step": 4393 }, { "epoch": 0.97, "learning_rate": 5.4653309386841004e-06, "logits/chosen": -1.9338593482971191, "logits/rejected": -1.9592946767807007, "logps/chosen": -31.54705238342285, "logps/rejected": -53.83147430419922, "loss": 1.8816, "rewards/accuracies": 0.0, "rewards/chosen": 3.268328905105591, "rewards/margins": -1.4936907291412354, "rewards/rejected": 4.762019634246826, "step": 4394 }, { "epoch": 0.97, "learning_rate": 5.463546359587913e-06, "logits/chosen": -1.72502863407135, "logits/rejected": -1.715561032295227, "logps/chosen": -105.77584838867188, "logps/rejected": -178.3192138671875, "loss": 1.0164, "rewards/accuracies": 0.0, "rewards/chosen": 6.463726997375488, "rewards/margins": -1.880411148071289, "rewards/rejected": 8.344138145446777, "step": 4395 }, { "epoch": 0.97, "learning_rate": 5.461761720927133e-06, "logits/chosen": -1.8995195627212524, "logits/rejected": -1.935910701751709, "logps/chosen": -106.18102264404297, "logps/rejected": -233.226806640625, "loss": 0.4102, "rewards/accuracies": 1.0, "rewards/chosen": 7.4045939445495605, "rewards/margins": 0.10433769226074219, "rewards/rejected": 7.300256252288818, "step": 4396 }, { "epoch": 0.97, "learning_rate": 5.45997702293108e-06, "logits/chosen": -1.5251914262771606, "logits/rejected": -1.5031651258468628, "logps/chosen": -91.42202758789062, "logps/rejected": -101.23041534423828, "loss": 0.6691, "rewards/accuracies": 0.0, "rewards/chosen": 5.788183689117432, "rewards/margins": -1.0187430381774902, "rewards/rejected": 6.806926727294922, "step": 4397 }, { "epoch": 0.97, "learning_rate": 5.458192265829087e-06, "logits/chosen": -1.4741597175598145, "logits/rejected": -1.4847761392593384, "logps/chosen": -108.39775848388672, "logps/rejected": -109.12240600585938, "loss": 0.8299, "rewards/accuracies": 0.0, "rewards/chosen": 5.51848840713501, "rewards/margins": -1.408205509185791, "rewards/rejected": 6.926693916320801, "step": 4398 }, { "epoch": 0.97, "learning_rate": 5.456407449850489e-06, "logits/chosen": -1.6820077896118164, "logits/rejected": -1.5351965427398682, "logps/chosen": -150.71255493164062, "logps/rejected": -143.62469482421875, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": 7.775353908538818, "rewards/margins": 1.112722396850586, "rewards/rejected": 6.662631511688232, "step": 4399 }, { "epoch": 0.97, "learning_rate": 5.45462257522463e-06, "logits/chosen": -1.8877125978469849, "logits/rejected": -1.9004428386688232, "logps/chosen": -45.68787384033203, "logps/rejected": -45.17123031616211, "loss": 0.5294, "rewards/accuracies": 0.0, "rewards/chosen": 2.804733991622925, "rewards/margins": -0.6153347492218018, "rewards/rejected": 3.4200687408447266, "step": 4400 }, { "epoch": 0.97, "learning_rate": 5.452837642180864e-06, "logits/chosen": -1.4585301876068115, "logits/rejected": -1.3747379779815674, "logps/chosen": -101.84857177734375, "logps/rejected": -85.4577865600586, "loss": 1.3693, "rewards/accuracies": 1.0, "rewards/chosen": 6.670539855957031, "rewards/margins": 3.1146528720855713, "rewards/rejected": 3.55588698387146, "step": 4401 }, { "epoch": 0.97, "learning_rate": 5.451052650948549e-06, "logits/chosen": -1.4386115074157715, "logits/rejected": -1.3887699842453003, "logps/chosen": -46.81174850463867, "logps/rejected": -37.68821716308594, "loss": 1.6491, "rewards/accuracies": 1.0, "rewards/chosen": 2.868699312210083, "rewards/margins": 0.7837214469909668, "rewards/rejected": 2.084977865219116, "step": 4402 }, { "epoch": 0.97, "learning_rate": 5.449267601757054e-06, "logits/chosen": -1.474578619003296, "logits/rejected": -1.474578619003296, "logps/chosen": -62.418121337890625, "logps/rejected": -62.418121337890625, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 5.215606689453125, "rewards/margins": 0.0, "rewards/rejected": 5.215606689453125, "step": 4403 }, { "epoch": 0.97, "learning_rate": 5.44748249483575e-06, "logits/chosen": -1.3255397081375122, "logits/rejected": -1.3255397081375122, "logps/chosen": -37.32157516479492, "logps/rejected": -37.32157516479492, "loss": 1.3025, "rewards/accuracies": 0.0, "rewards/chosen": 6.225824356079102, "rewards/margins": 0.0, "rewards/rejected": 6.225824356079102, "step": 4404 }, { "epoch": 0.97, "learning_rate": 5.4456973304140226e-06, "logits/chosen": -1.6034942865371704, "logits/rejected": -1.623482584953308, "logps/chosen": -68.37025451660156, "logps/rejected": -99.369140625, "loss": 0.5802, "rewards/accuracies": 0.0, "rewards/chosen": 3.1318893432617188, "rewards/margins": -0.6905670166015625, "rewards/rejected": 3.8224563598632812, "step": 4405 }, { "epoch": 0.98, "learning_rate": 5.443912108721259e-06, "logits/chosen": -1.4314677715301514, "logits/rejected": -1.4314677715301514, "logps/chosen": -65.41058349609375, "logps/rejected": -65.41058349609375, "loss": 4.1831, "rewards/accuracies": 0.0, "rewards/chosen": 3.904989004135132, "rewards/margins": 0.0, "rewards/rejected": 3.904989004135132, "step": 4406 }, { "epoch": 0.98, "learning_rate": 5.4421268299868575e-06, "logits/chosen": -1.4419230222702026, "logits/rejected": -1.6424994468688965, "logps/chosen": -73.37118530273438, "logps/rejected": -70.19975280761719, "loss": 4.4984, "rewards/accuracies": 0.0, "rewards/chosen": 3.790771484375, "rewards/margins": -8.603632926940918, "rewards/rejected": 12.394404411315918, "step": 4407 }, { "epoch": 0.98, "learning_rate": 5.440341494440221e-06, "logits/chosen": -1.8030136823654175, "logits/rejected": -1.8030136823654175, "logps/chosen": -73.04559326171875, "logps/rejected": -73.04559326171875, "loss": 1.3715, "rewards/accuracies": 0.0, "rewards/chosen": 2.7523467540740967, "rewards/margins": 0.0, "rewards/rejected": 2.7523467540740967, "step": 4408 }, { "epoch": 0.98, "learning_rate": 5.43855610231076e-06, "logits/chosen": -1.817948818206787, "logits/rejected": -1.8215566873550415, "logps/chosen": -36.448883056640625, "logps/rejected": -79.6075668334961, "loss": 0.8698, "rewards/accuracies": 0.0, "rewards/chosen": 2.739966630935669, "rewards/margins": -0.31795573234558105, "rewards/rejected": 3.05792236328125, "step": 4409 }, { "epoch": 0.98, "learning_rate": 5.436770653827894e-06, "logits/chosen": -1.843878984451294, "logits/rejected": -1.769130825996399, "logps/chosen": -120.76065826416016, "logps/rejected": -74.05908966064453, "loss": 0.2063, "rewards/accuracies": 1.0, "rewards/chosen": 6.274288177490234, "rewards/margins": 0.6819000244140625, "rewards/rejected": 5.592388153076172, "step": 4410 }, { "epoch": 0.98, "learning_rate": 5.43498514922105e-06, "logits/chosen": -1.7681270837783813, "logits/rejected": -1.5537663698196411, "logps/chosen": -167.69354248046875, "logps/rejected": -199.7219696044922, "loss": 1.7086, "rewards/accuracies": 0.0, "rewards/chosen": 7.150305271148682, "rewards/margins": -3.3324971199035645, "rewards/rejected": 10.482802391052246, "step": 4411 }, { "epoch": 0.98, "learning_rate": 5.43319958871966e-06, "logits/chosen": -1.4390511512756348, "logits/rejected": -1.4070534706115723, "logps/chosen": -42.96583557128906, "logps/rejected": -48.47906494140625, "loss": 1.2714, "rewards/accuracies": 0.0, "rewards/chosen": 1.9610732793807983, "rewards/margins": -1.678463101387024, "rewards/rejected": 3.6395363807678223, "step": 4412 }, { "epoch": 0.98, "learning_rate": 5.431413972553163e-06, "logits/chosen": -1.4241342544555664, "logits/rejected": -1.3182748556137085, "logps/chosen": -61.716094970703125, "logps/rejected": -43.72388458251953, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 7.247746467590332, "rewards/margins": 2.2886500358581543, "rewards/rejected": 4.959096431732178, "step": 4413 }, { "epoch": 0.98, "learning_rate": 5.42962830095101e-06, "logits/chosen": -1.6404560804367065, "logits/rejected": -1.6216909885406494, "logps/chosen": -46.86024475097656, "logps/rejected": -31.363178253173828, "loss": 2.368, "rewards/accuracies": 1.0, "rewards/chosen": 2.166778564453125, "rewards/margins": 0.14446520805358887, "rewards/rejected": 2.022313356399536, "step": 4414 }, { "epoch": 0.98, "learning_rate": 5.427842574142651e-06, "logits/chosen": -1.596114993095398, "logits/rejected": -1.4849636554718018, "logps/chosen": -150.8987579345703, "logps/rejected": -129.85055541992188, "loss": 0.7611, "rewards/accuracies": 0.0, "rewards/chosen": 6.062852382659912, "rewards/margins": -1.1431384086608887, "rewards/rejected": 7.205990791320801, "step": 4415 }, { "epoch": 0.98, "learning_rate": 5.426056792357552e-06, "logits/chosen": -1.6139219999313354, "logits/rejected": -1.542503833770752, "logps/chosen": -43.56725311279297, "logps/rejected": -23.729446411132812, "loss": 1.8897, "rewards/accuracies": 1.0, "rewards/chosen": 3.635451555252075, "rewards/margins": 2.8747076988220215, "rewards/rejected": 0.7607437372207642, "step": 4416 }, { "epoch": 0.98, "learning_rate": 5.424270955825176e-06, "logits/chosen": -1.8432471752166748, "logits/rejected": -1.8230838775634766, "logps/chosen": -31.367717742919922, "logps/rejected": -5.56282377243042, "loss": 0.8752, "rewards/accuracies": 1.0, "rewards/chosen": 1.66107976436615, "rewards/margins": 0.6680867671966553, "rewards/rejected": 0.9929929971694946, "step": 4417 }, { "epoch": 0.98, "learning_rate": 5.422485064775004e-06, "logits/chosen": -1.4012378454208374, "logits/rejected": -1.3838247060775757, "logps/chosen": -56.66117858886719, "logps/rejected": -65.19294738769531, "loss": 0.7781, "rewards/accuracies": 0.0, "rewards/chosen": 3.547595977783203, "rewards/margins": -1.3104815483093262, "rewards/rejected": 4.858077526092529, "step": 4418 }, { "epoch": 0.98, "learning_rate": 5.420699119436516e-06, "logits/chosen": -1.553876519203186, "logits/rejected": -1.5319350957870483, "logps/chosen": -78.78352355957031, "logps/rejected": -57.760066986083984, "loss": 2.108, "rewards/accuracies": 0.0, "rewards/chosen": 2.9666359424591064, "rewards/margins": -1.9999706745147705, "rewards/rejected": 4.966606616973877, "step": 4419 }, { "epoch": 0.98, "learning_rate": 5.418913120039203e-06, "logits/chosen": -1.8082095384597778, "logits/rejected": -1.781722903251648, "logps/chosen": -75.52781677246094, "logps/rejected": -28.21607208251953, "loss": 0.2576, "rewards/accuracies": 1.0, "rewards/chosen": 3.6909377574920654, "rewards/margins": 0.9664297103881836, "rewards/rejected": 2.724508047103882, "step": 4420 }, { "epoch": 0.98, "learning_rate": 5.417127066812561e-06, "logits/chosen": -1.7244973182678223, "logits/rejected": -1.7244973182678223, "logps/chosen": -79.32530212402344, "logps/rejected": -79.32530212402344, "loss": 0.3498, "rewards/accuracies": 0.0, "rewards/chosen": 4.777279853820801, "rewards/margins": 0.0, "rewards/rejected": 4.777279853820801, "step": 4421 }, { "epoch": 0.98, "learning_rate": 5.415340959986094e-06, "logits/chosen": -1.8213059902191162, "logits/rejected": -1.8191701173782349, "logps/chosen": -95.43440246582031, "logps/rejected": -40.033966064453125, "loss": 0.6282, "rewards/accuracies": 0.0, "rewards/chosen": 2.4582955837249756, "rewards/margins": -0.8668646812438965, "rewards/rejected": 3.325160264968872, "step": 4422 }, { "epoch": 0.98, "learning_rate": 5.413554799789312e-06, "logits/chosen": -1.673609733581543, "logits/rejected": -1.6561520099639893, "logps/chosen": -20.54852867126465, "logps/rejected": -58.00867462158203, "loss": 0.2488, "rewards/accuracies": 1.0, "rewards/chosen": 1.7349134683609009, "rewards/margins": 0.45486581325531006, "rewards/rejected": 1.2800476551055908, "step": 4423 }, { "epoch": 0.98, "learning_rate": 5.4117685864517334e-06, "logits/chosen": -1.5406452417373657, "logits/rejected": -1.5195460319519043, "logps/chosen": -104.52909088134766, "logps/rejected": -103.56998443603516, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 8.000530242919922, "rewards/margins": 2.462716579437256, "rewards/rejected": 5.537813663482666, "step": 4424 }, { "epoch": 0.98, "learning_rate": 5.4099823202028814e-06, "logits/chosen": -1.5114442110061646, "logits/rejected": -1.3370527029037476, "logps/chosen": -77.86549377441406, "logps/rejected": -20.94826889038086, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 3.85406494140625, "rewards/margins": 2.7511661052703857, "rewards/rejected": 1.1028988361358643, "step": 4425 }, { "epoch": 0.98, "learning_rate": 5.408196001272284e-06, "logits/chosen": -1.4075450897216797, "logits/rejected": -1.3824872970581055, "logps/chosen": -70.2069320678711, "logps/rejected": -54.13368606567383, "loss": 3.1798, "rewards/accuracies": 0.0, "rewards/chosen": 2.26631236076355, "rewards/margins": -0.0034749507904052734, "rewards/rejected": 2.269787311553955, "step": 4426 }, { "epoch": 0.98, "learning_rate": 5.406409629889484e-06, "logits/chosen": -1.538522481918335, "logits/rejected": -1.538522481918335, "logps/chosen": -79.39424133300781, "logps/rejected": -79.39424133300781, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 3.984541416168213, "rewards/margins": 0.0, "rewards/rejected": 3.984541416168213, "step": 4427 }, { "epoch": 0.98, "learning_rate": 5.404623206284023e-06, "logits/chosen": -1.639748215675354, "logits/rejected": -1.639125943183899, "logps/chosen": -111.18415832519531, "logps/rejected": -86.93457794189453, "loss": 1.6381, "rewards/accuracies": 0.0, "rewards/chosen": 7.893782138824463, "rewards/margins": -3.2073464393615723, "rewards/rejected": 11.101128578186035, "step": 4428 }, { "epoch": 0.98, "learning_rate": 5.402836730685452e-06, "logits/chosen": -1.441555142402649, "logits/rejected": -1.3899424076080322, "logps/chosen": -38.263587951660156, "logps/rejected": -42.69929504394531, "loss": 1.097, "rewards/accuracies": 1.0, "rewards/chosen": 2.56510329246521, "rewards/margins": 1.3097089529037476, "rewards/rejected": 1.2553943395614624, "step": 4429 }, { "epoch": 0.98, "learning_rate": 5.40105020332333e-06, "logits/chosen": -1.5601484775543213, "logits/rejected": -1.5424779653549194, "logps/chosen": -38.17511749267578, "logps/rejected": -34.650306701660156, "loss": 1.2794, "rewards/accuracies": 0.0, "rewards/chosen": 1.5561569929122925, "rewards/margins": -2.4719648361206055, "rewards/rejected": 4.0281219482421875, "step": 4430 }, { "epoch": 0.98, "learning_rate": 5.399263624427221e-06, "logits/chosen": -1.6668354272842407, "logits/rejected": -1.5706874132156372, "logps/chosen": -54.09264373779297, "logps/rejected": -23.791553497314453, "loss": 0.1645, "rewards/accuracies": 1.0, "rewards/chosen": 2.242154836654663, "rewards/margins": 0.9799230098724365, "rewards/rejected": 1.2622318267822266, "step": 4431 }, { "epoch": 0.98, "learning_rate": 5.397476994226697e-06, "logits/chosen": -1.5861026048660278, "logits/rejected": -1.5839585065841675, "logps/chosen": -26.791471481323242, "logps/rejected": -27.804298400878906, "loss": 1.3009, "rewards/accuracies": 0.0, "rewards/chosen": 1.0005651712417603, "rewards/margins": -1.0420106649398804, "rewards/rejected": 2.0425758361816406, "step": 4432 }, { "epoch": 0.98, "learning_rate": 5.3956903129513335e-06, "logits/chosen": -2.185502529144287, "logits/rejected": -2.19871187210083, "logps/chosen": -52.4708366394043, "logps/rejected": -68.33674621582031, "loss": 1.5495, "rewards/accuracies": 0.0, "rewards/chosen": 3.261338472366333, "rewards/margins": -0.848501443862915, "rewards/rejected": 4.109839916229248, "step": 4433 }, { "epoch": 0.98, "learning_rate": 5.393903580830716e-06, "logits/chosen": -1.736804723739624, "logits/rejected": -1.6803261041641235, "logps/chosen": -56.14164733886719, "logps/rejected": -17.225688934326172, "loss": 0.1875, "rewards/accuracies": 1.0, "rewards/chosen": 2.6017684936523438, "rewards/margins": 0.7887908220291138, "rewards/rejected": 1.81297767162323, "step": 4434 }, { "epoch": 0.98, "learning_rate": 5.392116798094436e-06, "logits/chosen": -1.641462802886963, "logits/rejected": -1.549187421798706, "logps/chosen": -186.23255920410156, "logps/rejected": -151.95152282714844, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 7.683271884918213, "rewards/margins": 0.3047208786010742, "rewards/rejected": 7.378551006317139, "step": 4435 }, { "epoch": 0.98, "learning_rate": 5.3903299649720895e-06, "logits/chosen": -1.616420030593872, "logits/rejected": -1.5610158443450928, "logps/chosen": -95.19982147216797, "logps/rejected": -56.60651779174805, "loss": 0.5689, "rewards/accuracies": 0.0, "rewards/chosen": 4.021205902099609, "rewards/margins": -0.17750072479248047, "rewards/rejected": 4.19870662689209, "step": 4436 }, { "epoch": 0.98, "learning_rate": 5.388543081693281e-06, "logits/chosen": -1.5687071084976196, "logits/rejected": -1.4891746044158936, "logps/chosen": -62.44171905517578, "logps/rejected": -8.39327621459961, "loss": 0.6693, "rewards/accuracies": 1.0, "rewards/chosen": 1.1224098205566406, "rewards/margins": 0.09774935245513916, "rewards/rejected": 1.0246604681015015, "step": 4437 }, { "epoch": 0.98, "learning_rate": 5.38675614848762e-06, "logits/chosen": -1.8061139583587646, "logits/rejected": -1.7186261415481567, "logps/chosen": -74.16653442382812, "logps/rejected": -49.29802703857422, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 6.693747043609619, "rewards/margins": 3.2809884548187256, "rewards/rejected": 3.4127585887908936, "step": 4438 }, { "epoch": 0.98, "learning_rate": 5.384969165584725e-06, "logits/chosen": -1.433240532875061, "logits/rejected": -1.433240532875061, "logps/chosen": -16.211668014526367, "logps/rejected": -16.211668014526367, "loss": 0.9381, "rewards/accuracies": 0.0, "rewards/chosen": 1.8469133377075195, "rewards/margins": 0.0, "rewards/rejected": 1.8469133377075195, "step": 4439 }, { "epoch": 0.98, "learning_rate": 5.383182133214218e-06, "logits/chosen": -1.4592853784561157, "logits/rejected": -1.42557692527771, "logps/chosen": -140.95291137695312, "logps/rejected": -65.53619384765625, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 6.43751859664917, "rewards/margins": 2.7775704860687256, "rewards/rejected": 3.6599481105804443, "step": 4440 }, { "epoch": 0.98, "learning_rate": 5.381395051605727e-06, "logits/chosen": -1.845862627029419, "logits/rejected": -1.8384143114089966, "logps/chosen": -134.81884765625, "logps/rejected": -134.625244140625, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 9.6458158493042, "rewards/margins": 3.505282402038574, "rewards/rejected": 6.140533447265625, "step": 4441 }, { "epoch": 0.98, "learning_rate": 5.379607920988887e-06, "logits/chosen": -1.824387788772583, "logits/rejected": -1.8374099731445312, "logps/chosen": -55.56648254394531, "logps/rejected": -40.41754150390625, "loss": 0.4573, "rewards/accuracies": 0.0, "rewards/chosen": 3.288191318511963, "rewards/margins": -0.39937424659729004, "rewards/rejected": 3.687565565109253, "step": 4442 }, { "epoch": 0.98, "learning_rate": 5.377820741593343e-06, "logits/chosen": -1.68976891040802, "logits/rejected": -1.5064791440963745, "logps/chosen": -146.259765625, "logps/rejected": -45.5579719543457, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 7.122575283050537, "rewards/margins": 5.35300874710083, "rewards/rejected": 1.7695664167404175, "step": 4443 }, { "epoch": 0.98, "learning_rate": 5.376033513648743e-06, "logits/chosen": -2.060338020324707, "logits/rejected": -1.9754478931427002, "logps/chosen": -135.13973999023438, "logps/rejected": -61.52363586425781, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 7.0431365966796875, "rewards/margins": 1.9716520309448242, "rewards/rejected": 5.071484565734863, "step": 4444 }, { "epoch": 0.98, "learning_rate": 5.374246237384739e-06, "logits/chosen": -1.4461650848388672, "logits/rejected": -1.3840190172195435, "logps/chosen": -36.91807174682617, "logps/rejected": -57.037330627441406, "loss": 0.5243, "rewards/accuracies": 0.0, "rewards/chosen": 2.596191883087158, "rewards/margins": -0.5906772613525391, "rewards/rejected": 3.1868691444396973, "step": 4445 }, { "epoch": 0.98, "learning_rate": 5.372458913030996e-06, "logits/chosen": -1.6821801662445068, "logits/rejected": -1.653705358505249, "logps/chosen": -119.12263488769531, "logps/rejected": -31.52132225036621, "loss": 0.6975, "rewards/accuracies": 1.0, "rewards/chosen": 6.447540283203125, "rewards/margins": 3.5378148555755615, "rewards/rejected": 2.9097254276275635, "step": 4446 }, { "epoch": 0.98, "learning_rate": 5.370671540817175e-06, "logits/chosen": -1.4026437997817993, "logits/rejected": -1.390810489654541, "logps/chosen": -37.432613372802734, "logps/rejected": -37.59330368041992, "loss": 0.9673, "rewards/accuracies": 0.0, "rewards/chosen": 2.317115545272827, "rewards/margins": -1.7590653896331787, "rewards/rejected": 4.076180934906006, "step": 4447 }, { "epoch": 0.98, "learning_rate": 5.368884120972953e-06, "logits/chosen": -1.4232014417648315, "logits/rejected": -1.443976640701294, "logps/chosen": -50.92491912841797, "logps/rejected": -82.29951477050781, "loss": 0.6674, "rewards/accuracies": 0.0, "rewards/chosen": 3.5074546337127686, "rewards/margins": -0.8840920925140381, "rewards/rejected": 4.391546726226807, "step": 4448 }, { "epoch": 0.98, "learning_rate": 5.36709665372801e-06, "logits/chosen": -1.4264812469482422, "logits/rejected": -1.4393126964569092, "logps/chosen": -167.77862548828125, "logps/rejected": -73.86918640136719, "loss": 0.1505, "rewards/accuracies": 1.0, "rewards/chosen": 7.029994487762451, "rewards/margins": 1.0787630081176758, "rewards/rejected": 5.951231479644775, "step": 4449 }, { "epoch": 0.98, "learning_rate": 5.365309139312027e-06, "logits/chosen": -1.904641032218933, "logits/rejected": -1.826813817024231, "logps/chosen": -63.355567932128906, "logps/rejected": -13.18299674987793, "loss": 0.3888, "rewards/accuracies": 1.0, "rewards/chosen": 4.186557769775391, "rewards/margins": 1.6585328578948975, "rewards/rejected": 2.528024911880493, "step": 4450 }, { "epoch": 0.99, "learning_rate": 5.363521577954698e-06, "logits/chosen": -1.2388373613357544, "logits/rejected": -1.2425882816314697, "logps/chosen": -55.96399688720703, "logps/rejected": -56.210697174072266, "loss": 0.5999, "rewards/accuracies": 1.0, "rewards/chosen": 2.2171075344085693, "rewards/margins": 0.5774823427200317, "rewards/rejected": 1.6396251916885376, "step": 4451 }, { "epoch": 0.99, "learning_rate": 5.361733969885721e-06, "logits/chosen": -1.7229365110397339, "logits/rejected": -1.6999890804290771, "logps/chosen": -106.35491943359375, "logps/rejected": -95.10289001464844, "loss": 1.2907, "rewards/accuracies": 0.0, "rewards/chosen": 6.374426364898682, "rewards/margins": -2.497976779937744, "rewards/rejected": 8.872403144836426, "step": 4452 }, { "epoch": 0.99, "learning_rate": 5.359946315334797e-06, "logits/chosen": -1.9966871738433838, "logits/rejected": -1.8753389120101929, "logps/chosen": -115.74214172363281, "logps/rejected": -78.41362762451172, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 8.128417015075684, "rewards/margins": 1.4796781539916992, "rewards/rejected": 6.648738861083984, "step": 4453 }, { "epoch": 0.99, "learning_rate": 5.358158614531639e-06, "logits/chosen": -1.556858777999878, "logits/rejected": -1.5444355010986328, "logps/chosen": -87.20838928222656, "logps/rejected": -64.5391616821289, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": 4.040199279785156, "rewards/margins": 2.0794363021850586, "rewards/rejected": 1.960762858390808, "step": 4454 }, { "epoch": 0.99, "learning_rate": 5.3563708677059575e-06, "logits/chosen": -1.4333420991897583, "logits/rejected": -1.4333420991897583, "logps/chosen": -40.80414962768555, "logps/rejected": -40.80414962768555, "loss": 0.659, "rewards/accuracies": 0.0, "rewards/chosen": 3.1711132526397705, "rewards/margins": 0.0, "rewards/rejected": 3.1711132526397705, "step": 4455 }, { "epoch": 0.99, "learning_rate": 5.354583075087479e-06, "logits/chosen": -1.678676962852478, "logits/rejected": -1.6298741102218628, "logps/chosen": -63.85432815551758, "logps/rejected": -50.40315246582031, "loss": 0.4415, "rewards/accuracies": 0.0, "rewards/chosen": 3.3388562202453613, "rewards/margins": -0.2952463626861572, "rewards/rejected": 3.6341025829315186, "step": 4456 }, { "epoch": 0.99, "learning_rate": 5.352795236905925e-06, "logits/chosen": -1.5731430053710938, "logits/rejected": -1.352842926979065, "logps/chosen": -79.4386215209961, "logps/rejected": -49.080589294433594, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 2.804358720779419, "rewards/margins": 1.8495136499404907, "rewards/rejected": 0.9548450708389282, "step": 4457 }, { "epoch": 0.99, "learning_rate": 5.3510073533910344e-06, "logits/chosen": -1.5565170049667358, "logits/rejected": -1.4598015546798706, "logps/chosen": -43.50117492675781, "logps/rejected": -6.406103134155273, "loss": 1.3869, "rewards/accuracies": 1.0, "rewards/chosen": 5.086818218231201, "rewards/margins": 4.2451863288879395, "rewards/rejected": 0.8416320085525513, "step": 4458 }, { "epoch": 0.99, "learning_rate": 5.34921942477254e-06, "logits/chosen": -1.404848575592041, "logits/rejected": -1.3860646486282349, "logps/chosen": -45.46277618408203, "logps/rejected": -60.75677490234375, "loss": 2.3732, "rewards/accuracies": 0.0, "rewards/chosen": 2.991748094558716, "rewards/margins": -2.745832681655884, "rewards/rejected": 5.7375807762146, "step": 4459 }, { "epoch": 0.99, "learning_rate": 5.3474314512801905e-06, "logits/chosen": -1.5273107290267944, "logits/rejected": -1.5387487411499023, "logps/chosen": -207.32630920410156, "logps/rejected": -126.40526580810547, "loss": 1.2564, "rewards/accuracies": 1.0, "rewards/chosen": 8.043105125427246, "rewards/margins": 1.2684245109558105, "rewards/rejected": 6.7746806144714355, "step": 4460 }, { "epoch": 0.99, "learning_rate": 5.3456434331437335e-06, "logits/chosen": -1.7833653688430786, "logits/rejected": -1.8148036003112793, "logps/chosen": -49.629493713378906, "logps/rejected": -63.92395782470703, "loss": 1.8764, "rewards/accuracies": 0.0, "rewards/chosen": 3.625084638595581, "rewards/margins": -3.349212884902954, "rewards/rejected": 6.974297523498535, "step": 4461 }, { "epoch": 0.99, "learning_rate": 5.343855370592927e-06, "logits/chosen": -1.2777410745620728, "logits/rejected": -1.207269310951233, "logps/chosen": -57.40789031982422, "logps/rejected": -31.902420043945312, "loss": 0.4475, "rewards/accuracies": 0.0, "rewards/chosen": 1.8803528547286987, "rewards/margins": -0.08175694942474365, "rewards/rejected": 1.9621098041534424, "step": 4462 }, { "epoch": 0.99, "learning_rate": 5.342067263857531e-06, "logits/chosen": -2.0672688484191895, "logits/rejected": -2.0561275482177734, "logps/chosen": -28.054481506347656, "logps/rejected": -84.16988372802734, "loss": 0.989, "rewards/accuracies": 0.0, "rewards/chosen": 2.811108350753784, "rewards/margins": -0.7339577674865723, "rewards/rejected": 3.5450661182403564, "step": 4463 }, { "epoch": 0.99, "learning_rate": 5.340279113167315e-06, "logits/chosen": -1.5786972045898438, "logits/rejected": -1.537284016609192, "logps/chosen": -115.01840209960938, "logps/rejected": -81.7275619506836, "loss": 0.2611, "rewards/accuracies": 1.0, "rewards/chosen": 3.87619948387146, "rewards/margins": 0.7105979919433594, "rewards/rejected": 3.1656014919281006, "step": 4464 }, { "epoch": 0.99, "learning_rate": 5.338490918752051e-06, "logits/chosen": -1.6587451696395874, "logits/rejected": -1.6587451696395874, "logps/chosen": -40.01066207885742, "logps/rejected": -40.01066207885742, "loss": 0.4525, "rewards/accuracies": 0.0, "rewards/chosen": 4.785346031188965, "rewards/margins": 0.0, "rewards/rejected": 4.785346031188965, "step": 4465 }, { "epoch": 0.99, "learning_rate": 5.3367026808415176e-06, "logits/chosen": -1.7627487182617188, "logits/rejected": -1.7339198589324951, "logps/chosen": -46.80304718017578, "logps/rejected": -36.7889518737793, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 3.1872735023498535, "rewards/margins": 2.1764707565307617, "rewards/rejected": 1.0108028650283813, "step": 4466 }, { "epoch": 0.99, "learning_rate": 5.334914399665499e-06, "logits/chosen": -1.453935146331787, "logits/rejected": -1.4148870706558228, "logps/chosen": -57.52361297607422, "logps/rejected": -42.895545959472656, "loss": 0.3923, "rewards/accuracies": 1.0, "rewards/chosen": 3.138570547103882, "rewards/margins": 1.7575501203536987, "rewards/rejected": 1.381020426750183, "step": 4467 }, { "epoch": 0.99, "learning_rate": 5.333126075453785e-06, "logits/chosen": -1.9053572416305542, "logits/rejected": -1.8986724615097046, "logps/chosen": -37.62369918823242, "logps/rejected": -44.614845275878906, "loss": 1.6985, "rewards/accuracies": 0.0, "rewards/chosen": 1.5913848876953125, "rewards/margins": -1.9269731044769287, "rewards/rejected": 3.518357992172241, "step": 4468 }, { "epoch": 0.99, "learning_rate": 5.33133770843617e-06, "logits/chosen": -1.6560925245285034, "logits/rejected": -1.6560925245285034, "logps/chosen": -40.630096435546875, "logps/rejected": -40.630096435546875, "loss": 0.4393, "rewards/accuracies": 0.0, "rewards/chosen": 2.634533643722534, "rewards/margins": 0.0, "rewards/rejected": 2.634533643722534, "step": 4469 }, { "epoch": 0.99, "learning_rate": 5.3295492988424576e-06, "logits/chosen": -1.757859230041504, "logits/rejected": -1.7584130764007568, "logps/chosen": -34.643028259277344, "logps/rejected": -76.60331726074219, "loss": 0.3341, "rewards/accuracies": 1.0, "rewards/chosen": 3.038928985595703, "rewards/margins": 0.39630651473999023, "rewards/rejected": 2.642622470855713, "step": 4470 }, { "epoch": 0.99, "learning_rate": 5.3277608469024515e-06, "logits/chosen": -1.6320390701293945, "logits/rejected": -1.5371896028518677, "logps/chosen": -163.044189453125, "logps/rejected": -17.284236907958984, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 7.826345920562744, "rewards/margins": 6.7193450927734375, "rewards/rejected": 1.107000708580017, "step": 4471 }, { "epoch": 0.99, "learning_rate": 5.325972352845965e-06, "logits/chosen": -1.476972222328186, "logits/rejected": -1.4638737440109253, "logps/chosen": -25.308128356933594, "logps/rejected": -58.32970428466797, "loss": 0.8859, "rewards/accuracies": 1.0, "rewards/chosen": 2.841797351837158, "rewards/margins": 0.463700532913208, "rewards/rejected": 2.37809681892395, "step": 4472 }, { "epoch": 0.99, "learning_rate": 5.324183816902814e-06, "logits/chosen": -1.4919732809066772, "logits/rejected": -1.4919732809066772, "logps/chosen": -45.38355255126953, "logps/rejected": -45.38355255126953, "loss": 0.4158, "rewards/accuracies": 0.0, "rewards/chosen": 2.224438428878784, "rewards/margins": 0.0, "rewards/rejected": 2.224438428878784, "step": 4473 }, { "epoch": 0.99, "learning_rate": 5.322395239302823e-06, "logits/chosen": -1.7382447719573975, "logits/rejected": -1.737432837486267, "logps/chosen": -43.787109375, "logps/rejected": -30.472719192504883, "loss": 0.7653, "rewards/accuracies": 0.0, "rewards/chosen": 1.8563072681427002, "rewards/margins": -1.2845234870910645, "rewards/rejected": 3.1408307552337646, "step": 4474 }, { "epoch": 0.99, "learning_rate": 5.320606620275819e-06, "logits/chosen": -2.051743984222412, "logits/rejected": -1.946904182434082, "logps/chosen": -109.19084930419922, "logps/rejected": -66.085205078125, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 7.6147990226745605, "rewards/margins": 3.008699893951416, "rewards/rejected": 4.6060991287231445, "step": 4475 }, { "epoch": 0.99, "learning_rate": 5.318817960051636e-06, "logits/chosen": -1.529650330543518, "logits/rejected": -1.4094599485397339, "logps/chosen": -106.73658752441406, "logps/rejected": -37.47954177856445, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 6.432872295379639, "rewards/margins": 3.624164342880249, "rewards/rejected": 2.8087079524993896, "step": 4476 }, { "epoch": 0.99, "learning_rate": 5.317029258860109e-06, "logits/chosen": -1.8804230690002441, "logits/rejected": -1.735643744468689, "logps/chosen": -123.60345458984375, "logps/rejected": -51.820762634277344, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 7.268353462219238, "rewards/margins": 4.59067440032959, "rewards/rejected": 2.6776788234710693, "step": 4477 }, { "epoch": 0.99, "learning_rate": 5.315240516931085e-06, "logits/chosen": -1.466472864151001, "logits/rejected": -1.4820899963378906, "logps/chosen": -30.857330322265625, "logps/rejected": -39.311607360839844, "loss": 4.3368, "rewards/accuracies": 0.0, "rewards/chosen": 1.6031116247177124, "rewards/margins": -6.358604907989502, "rewards/rejected": 7.961716651916504, "step": 4478 }, { "epoch": 0.99, "learning_rate": 5.313451734494413e-06, "logits/chosen": -1.6824324131011963, "logits/rejected": -1.754408597946167, "logps/chosen": -53.97905731201172, "logps/rejected": -111.39002990722656, "loss": 1.9557, "rewards/accuracies": 0.0, "rewards/chosen": 2.884502410888672, "rewards/margins": -1.0396599769592285, "rewards/rejected": 3.9241623878479004, "step": 4479 }, { "epoch": 0.99, "learning_rate": 5.311662911779945e-06, "logits/chosen": -1.4808460474014282, "logits/rejected": -1.3345818519592285, "logps/chosen": -57.27676010131836, "logps/rejected": -79.10191345214844, "loss": 0.6054, "rewards/accuracies": 0.0, "rewards/chosen": 3.5393223762512207, "rewards/margins": -0.8522272109985352, "rewards/rejected": 4.391549587249756, "step": 4480 }, { "epoch": 0.99, "learning_rate": 5.309874049017543e-06, "logits/chosen": -1.6692802906036377, "logits/rejected": -1.6332789659500122, "logps/chosen": -65.12892150878906, "logps/rejected": -44.69611358642578, "loss": 0.9376, "rewards/accuracies": 0.0, "rewards/chosen": 1.9932793378829956, "rewards/margins": -0.0246657133102417, "rewards/rejected": 2.0179450511932373, "step": 4481 }, { "epoch": 0.99, "learning_rate": 5.308085146437072e-06, "logits/chosen": -1.2461333274841309, "logits/rejected": -1.2543011903762817, "logps/chosen": -63.981231689453125, "logps/rejected": -75.46570587158203, "loss": 0.7584, "rewards/accuracies": 0.0, "rewards/chosen": 2.5344529151916504, "rewards/margins": -1.258206844329834, "rewards/rejected": 3.7926597595214844, "step": 4482 }, { "epoch": 0.99, "learning_rate": 5.306296204268398e-06, "logits/chosen": -1.3916484117507935, "logits/rejected": -1.3088809251785278, "logps/chosen": -56.31692886352539, "logps/rejected": -53.71678161621094, "loss": 0.1991, "rewards/accuracies": 1.0, "rewards/chosen": 3.373734712600708, "rewards/margins": 0.8818981647491455, "rewards/rejected": 2.4918365478515625, "step": 4483 }, { "epoch": 0.99, "learning_rate": 5.304507222741399e-06, "logits/chosen": -1.5014736652374268, "logits/rejected": -1.4704258441925049, "logps/chosen": -66.0427474975586, "logps/rejected": -70.86275482177734, "loss": 2.6338, "rewards/accuracies": 0.0, "rewards/chosen": 2.635335683822632, "rewards/margins": -1.2926833629608154, "rewards/rejected": 3.9280190467834473, "step": 4484 }, { "epoch": 0.99, "learning_rate": 5.302718202085955e-06, "logits/chosen": -1.5223637819290161, "logits/rejected": -1.4697637557983398, "logps/chosen": -58.025848388671875, "logps/rejected": -23.532955169677734, "loss": 0.844, "rewards/accuracies": 1.0, "rewards/chosen": 3.3490912914276123, "rewards/margins": 1.9905810356140137, "rewards/rejected": 1.3585102558135986, "step": 4485 }, { "epoch": 0.99, "learning_rate": 5.30092914253195e-06, "logits/chosen": -1.6482782363891602, "logits/rejected": -1.562227487564087, "logps/chosen": -86.91413879394531, "logps/rejected": -78.3438949584961, "loss": 0.7679, "rewards/accuracies": 0.0, "rewards/chosen": 2.299264669418335, "rewards/margins": -1.2880866527557373, "rewards/rejected": 3.5873513221740723, "step": 4486 }, { "epoch": 0.99, "learning_rate": 5.299140044309272e-06, "logits/chosen": -1.6493397951126099, "logits/rejected": -1.5974446535110474, "logps/chosen": -70.61241149902344, "logps/rejected": -54.24990463256836, "loss": 1.1775, "rewards/accuracies": 0.0, "rewards/chosen": 1.8092094659805298, "rewards/margins": -1.9202443361282349, "rewards/rejected": 3.7294538021087646, "step": 4487 }, { "epoch": 0.99, "learning_rate": 5.297350907647818e-06, "logits/chosen": -1.7076107263565063, "logits/rejected": -1.7082369327545166, "logps/chosen": -104.72834014892578, "logps/rejected": -86.26172637939453, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": 6.7104058265686035, "rewards/margins": 2.1739883422851562, "rewards/rejected": 4.536417484283447, "step": 4488 }, { "epoch": 0.99, "learning_rate": 5.295561732777487e-06, "logits/chosen": -1.5119940042495728, "logits/rejected": -1.4276740550994873, "logps/chosen": -109.93240356445312, "logps/rejected": -178.19393920898438, "loss": 2.6652, "rewards/accuracies": 0.0, "rewards/chosen": 7.885435581207275, "rewards/margins": -1.7013840675354004, "rewards/rejected": 9.586819648742676, "step": 4489 }, { "epoch": 0.99, "learning_rate": 5.293772519928183e-06, "logits/chosen": -1.729381799697876, "logits/rejected": -1.6866230964660645, "logps/chosen": -60.09775924682617, "logps/rejected": -62.0987434387207, "loss": 1.1812, "rewards/accuracies": 0.0, "rewards/chosen": 2.0842716693878174, "rewards/margins": -2.108879804611206, "rewards/rejected": 4.193151473999023, "step": 4490 }, { "epoch": 0.99, "learning_rate": 5.291983269329819e-06, "logits/chosen": -1.425382137298584, "logits/rejected": -1.358181118965149, "logps/chosen": -47.497467041015625, "logps/rejected": -32.86040496826172, "loss": 1.191, "rewards/accuracies": 1.0, "rewards/chosen": 3.8643510341644287, "rewards/margins": 0.873812198638916, "rewards/rejected": 2.9905388355255127, "step": 4491 }, { "epoch": 0.99, "learning_rate": 5.290193981212305e-06, "logits/chosen": -1.612247109413147, "logits/rejected": -1.5329132080078125, "logps/chosen": -83.91834259033203, "logps/rejected": -64.693603515625, "loss": 0.1042, "rewards/accuracies": 1.0, "rewards/chosen": 7.266953468322754, "rewards/margins": 1.8866052627563477, "rewards/rejected": 5.380348205566406, "step": 4492 }, { "epoch": 0.99, "learning_rate": 5.288404655805561e-06, "logits/chosen": -1.8523179292678833, "logits/rejected": -1.755340814590454, "logps/chosen": -63.843650817871094, "logps/rejected": -81.08456420898438, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 5.438488960266113, "rewards/margins": 2.2575998306274414, "rewards/rejected": 3.180889129638672, "step": 4493 }, { "epoch": 0.99, "learning_rate": 5.286615293339514e-06, "logits/chosen": -2.06191349029541, "logits/rejected": -2.0609970092773438, "logps/chosen": -63.338539123535156, "logps/rejected": -126.87037658691406, "loss": 2.9913, "rewards/accuracies": 0.0, "rewards/chosen": 2.1912620067596436, "rewards/margins": -5.644957542419434, "rewards/rejected": 7.836219787597656, "step": 4494 }, { "epoch": 0.99, "learning_rate": 5.28482589404409e-06, "logits/chosen": -1.5941851139068604, "logits/rejected": -1.5941851139068604, "logps/chosen": -31.123552322387695, "logps/rejected": -31.123552322387695, "loss": 0.4772, "rewards/accuracies": 0.0, "rewards/chosen": 2.25539231300354, "rewards/margins": 0.0, "rewards/rejected": 2.25539231300354, "step": 4495 }, { "epoch": 1.0, "learning_rate": 5.2830364581492235e-06, "logits/chosen": -1.492975115776062, "logits/rejected": -1.3520302772521973, "logps/chosen": -73.97115325927734, "logps/rejected": -10.648412704467773, "loss": 0.2377, "rewards/accuracies": 1.0, "rewards/chosen": 3.7836227416992188, "rewards/margins": 3.2473347187042236, "rewards/rejected": 0.5362880825996399, "step": 4496 }, { "epoch": 1.0, "learning_rate": 5.281246985884852e-06, "logits/chosen": -1.8180590867996216, "logits/rejected": -1.8657218217849731, "logps/chosen": -107.40318298339844, "logps/rejected": -105.25878143310547, "loss": 2.0631, "rewards/accuracies": 0.0, "rewards/chosen": 7.0152177810668945, "rewards/margins": -4.109663009643555, "rewards/rejected": 11.12488079071045, "step": 4497 }, { "epoch": 1.0, "learning_rate": 5.27945747748092e-06, "logits/chosen": -1.6333866119384766, "logits/rejected": -1.5771771669387817, "logps/chosen": -46.78744888305664, "logps/rejected": -20.76567840576172, "loss": 0.7865, "rewards/accuracies": 1.0, "rewards/chosen": 2.4570844173431396, "rewards/margins": 2.03267502784729, "rewards/rejected": 0.42440930008888245, "step": 4498 }, { "epoch": 1.0, "learning_rate": 5.277667933167373e-06, "logits/chosen": -1.6737040281295776, "logits/rejected": -1.6361994743347168, "logps/chosen": -107.62245178222656, "logps/rejected": -102.43679809570312, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 5.657936096191406, "rewards/margins": 2.4929213523864746, "rewards/rejected": 3.1650147438049316, "step": 4499 }, { "epoch": 1.0, "learning_rate": 5.2758783531741655e-06, "logits/chosen": -1.7226163148880005, "logits/rejected": -1.7226163148880005, "logps/chosen": -19.056201934814453, "logps/rejected": -19.056201934814453, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.3511730432510376, "rewards/margins": 0.0, "rewards/rejected": 1.3511730432510376, "step": 4500 }, { "epoch": 1.0, "learning_rate": 5.274088737731251e-06, "logits/chosen": -1.6351747512817383, "logits/rejected": -1.5787452459335327, "logps/chosen": -87.52970886230469, "logps/rejected": -45.15691375732422, "loss": 0.2486, "rewards/accuracies": 1.0, "rewards/chosen": 5.863826274871826, "rewards/margins": 0.6703195571899414, "rewards/rejected": 5.193506717681885, "step": 4501 }, { "epoch": 1.0, "learning_rate": 5.272299087068593e-06, "logits/chosen": -1.3174241781234741, "logits/rejected": -1.2653917074203491, "logps/chosen": -54.0836067199707, "logps/rejected": -95.48696899414062, "loss": 1.7272, "rewards/accuracies": 1.0, "rewards/chosen": 4.44230842590332, "rewards/margins": 0.9056627750396729, "rewards/rejected": 3.5366456508636475, "step": 4502 }, { "epoch": 1.0, "learning_rate": 5.270509401416156e-06, "logits/chosen": -2.101114273071289, "logits/rejected": -2.053584337234497, "logps/chosen": -91.5870132446289, "logps/rejected": -145.93023681640625, "loss": 1.9015, "rewards/accuracies": 0.0, "rewards/chosen": 6.215743541717529, "rewards/margins": -3.772132396697998, "rewards/rejected": 9.987875938415527, "step": 4503 }, { "epoch": 1.0, "learning_rate": 5.268719681003913e-06, "logits/chosen": -1.4325282573699951, "logits/rejected": -1.3205486536026, "logps/chosen": -125.20884704589844, "logps/rejected": -40.09663772583008, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 4.958906650543213, "rewards/margins": 4.649400234222412, "rewards/rejected": 0.30950623750686646, "step": 4504 }, { "epoch": 1.0, "learning_rate": 5.266929926061836e-06, "logits/chosen": -1.682922601699829, "logits/rejected": -1.5962251424789429, "logps/chosen": -155.86474609375, "logps/rejected": -175.3310089111328, "loss": 1.566, "rewards/accuracies": 0.0, "rewards/chosen": 6.413094997406006, "rewards/margins": -1.4048175811767578, "rewards/rejected": 7.817912578582764, "step": 4505 }, { "epoch": 1.0, "learning_rate": 5.265140136819905e-06, "logits/chosen": -1.4380515813827515, "logits/rejected": -1.2029439210891724, "logps/chosen": -36.8265495300293, "logps/rejected": -70.49755859375, "loss": 1.9007, "rewards/accuracies": 0.0, "rewards/chosen": 2.3709347248077393, "rewards/margins": -3.7373478412628174, "rewards/rejected": 6.108282566070557, "step": 4506 }, { "epoch": 1.0, "learning_rate": 5.263350313508105e-06, "logits/chosen": -2.0766804218292236, "logits/rejected": -2.054673433303833, "logps/chosen": -58.45155334472656, "logps/rejected": -93.2940673828125, "loss": 0.7875, "rewards/accuracies": 0.0, "rewards/chosen": 3.9784486293792725, "rewards/margins": -1.3396832942962646, "rewards/rejected": 5.318131923675537, "step": 4507 }, { "epoch": 1.0, "learning_rate": 5.261560456356424e-06, "logits/chosen": -1.4720754623413086, "logits/rejected": -1.326321005821228, "logps/chosen": -138.017578125, "logps/rejected": -39.38374710083008, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 5.665759563446045, "rewards/margins": 3.207366704940796, "rewards/rejected": 2.458392858505249, "step": 4508 }, { "epoch": 1.0, "learning_rate": 5.259770565594851e-06, "logits/chosen": -1.814307451248169, "logits/rejected": -1.7220954895019531, "logps/chosen": -99.00702667236328, "logps/rejected": -46.161930084228516, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": 7.902364253997803, "rewards/margins": 3.6026620864868164, "rewards/rejected": 4.299702167510986, "step": 4509 }, { "epoch": 1.0, "learning_rate": 5.257980641453384e-06, "logits/chosen": -1.6111464500427246, "logits/rejected": -1.6111464500427246, "logps/chosen": -24.270524978637695, "logps/rejected": -24.270524978637695, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.02434387244284153, "rewards/margins": 0.0, "rewards/rejected": 0.02434387244284153, "step": 4510 }, { "epoch": 1.0, "learning_rate": 5.256190684162026e-06, "logits/chosen": -1.5298256874084473, "logits/rejected": -1.381235122680664, "logps/chosen": -128.3002471923828, "logps/rejected": -47.922794342041016, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 5.537239074707031, "rewards/margins": 2.7904446125030518, "rewards/rejected": 2.7467944622039795, "step": 4511 }, { "epoch": 1.0, "learning_rate": 5.25440069395078e-06, "logits/chosen": -1.7378841638565063, "logits/rejected": -1.7131015062332153, "logps/chosen": -39.927635192871094, "logps/rejected": -65.68045043945312, "loss": 0.3095, "rewards/accuracies": 1.0, "rewards/chosen": 3.4363763332366943, "rewards/margins": 0.430330753326416, "rewards/rejected": 3.0060455799102783, "step": 4512 }, { "epoch": 1.0, "learning_rate": 5.252610671049657e-06, "logits/chosen": -1.5814008712768555, "logits/rejected": -1.5392783880233765, "logps/chosen": -52.033538818359375, "logps/rejected": -23.873899459838867, "loss": 0.4574, "rewards/accuracies": 1.0, "rewards/chosen": 1.9294251203536987, "rewards/margins": 0.03940725326538086, "rewards/rejected": 1.8900178670883179, "step": 4513 }, { "epoch": 1.0, "learning_rate": 5.25082061568867e-06, "logits/chosen": -1.7648924589157104, "logits/rejected": -1.7514127492904663, "logps/chosen": -68.82781982421875, "logps/rejected": -57.76947784423828, "loss": 0.7774, "rewards/accuracies": 0.0, "rewards/chosen": 3.0425751209259033, "rewards/margins": -1.2380998134613037, "rewards/rejected": 4.280674934387207, "step": 4514 }, { "epoch": 1.0, "learning_rate": 5.249030528097838e-06, "logits/chosen": -1.6004443168640137, "logits/rejected": -1.4472472667694092, "logps/chosen": -71.88482666015625, "logps/rejected": -58.33936309814453, "loss": 0.8269, "rewards/accuracies": 0.0, "rewards/chosen": 3.104963779449463, "rewards/margins": -1.250643253326416, "rewards/rejected": 4.355607032775879, "step": 4515 }, { "epoch": 1.0, "learning_rate": 5.24724040850718e-06, "logits/chosen": -1.6418248414993286, "logits/rejected": -1.4592310190200806, "logps/chosen": -73.30570983886719, "logps/rejected": -37.97210693359375, "loss": 0.0707, "rewards/accuracies": 1.0, "rewards/chosen": 7.04879903793335, "rewards/margins": 2.5570297241210938, "rewards/rejected": 4.491769313812256, "step": 4516 }, { "epoch": 1.0, "learning_rate": 5.245450257146726e-06, "logits/chosen": -1.6060445308685303, "logits/rejected": -1.6153826713562012, "logps/chosen": -40.4937858581543, "logps/rejected": -81.58523559570312, "loss": 0.2917, "rewards/accuracies": 1.0, "rewards/chosen": 3.302879810333252, "rewards/margins": 1.1608929634094238, "rewards/rejected": 2.141986846923828, "step": 4517 }, { "epoch": 1.0, "learning_rate": 5.243660074246503e-06, "logits/chosen": -1.6185665130615234, "logits/rejected": -1.5842235088348389, "logps/chosen": -61.71006774902344, "logps/rejected": -57.788700103759766, "loss": 1.6602, "rewards/accuracies": 0.0, "rewards/chosen": 2.3448073863983154, "rewards/margins": -0.34864163398742676, "rewards/rejected": 2.693449020385742, "step": 4518 }, { "epoch": 1.0, "learning_rate": 5.2418698600365445e-06, "logits/chosen": -1.8692622184753418, "logits/rejected": -1.8151479959487915, "logps/chosen": -140.10385131835938, "logps/rejected": -102.68107604980469, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 7.935621738433838, "rewards/margins": 3.010922431945801, "rewards/rejected": 4.924699306488037, "step": 4519 }, { "epoch": 1.0, "learning_rate": 5.240079614746892e-06, "logits/chosen": -1.7285155057907104, "logits/rejected": -1.5773606300354004, "logps/chosen": -97.16712951660156, "logps/rejected": -88.1701431274414, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 7.761244297027588, "rewards/margins": 2.3538780212402344, "rewards/rejected": 5.4073662757873535, "step": 4520 }, { "epoch": 1.0, "learning_rate": 5.238289338607585e-06, "logits/chosen": -1.6603668928146362, "logits/rejected": -1.5046429634094238, "logps/chosen": -124.30292510986328, "logps/rejected": -36.548439025878906, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 6.788512706756592, "rewards/margins": 4.496253967285156, "rewards/rejected": 2.2922585010528564, "step": 4521 }, { "epoch": 1.0, "learning_rate": 5.236499031848672e-06, "logits/chosen": -1.770809531211853, "logits/rejected": -1.824100375175476, "logps/chosen": -41.299949645996094, "logps/rejected": -104.86396789550781, "loss": 2.1758, "rewards/accuracies": 0.0, "rewards/chosen": 5.0829997062683105, "rewards/margins": -4.294785022735596, "rewards/rejected": 9.377784729003906, "step": 4522 }, { "epoch": 1.0, "learning_rate": 5.234708694700201e-06, "logits/chosen": -1.7535291910171509, "logits/rejected": -1.6027864217758179, "logps/chosen": -52.119537353515625, "logps/rejected": -21.066930770874023, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 4.848753452301025, "rewards/margins": 4.620112419128418, "rewards/rejected": 0.22864094376564026, "step": 4523 }, { "epoch": 1.0, "learning_rate": 5.232918327392229e-06, "logits/chosen": -1.920195460319519, "logits/rejected": -1.8920835256576538, "logps/chosen": -77.5069351196289, "logps/rejected": -92.31206512451172, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 6.702027320861816, "rewards/margins": 1.2779998779296875, "rewards/rejected": 5.424027442932129, "step": 4524 }, { "epoch": 1.0, "learning_rate": 5.23112793015481e-06, "logits/chosen": -1.6109870672225952, "logits/rejected": -1.553392767906189, "logps/chosen": -34.30390167236328, "logps/rejected": -31.682708740234375, "loss": 0.2832, "rewards/accuracies": 1.0, "rewards/chosen": 1.924932837486267, "rewards/margins": 0.7298346757888794, "rewards/rejected": 1.1950981616973877, "step": 4525 }, { "epoch": 1.0, "learning_rate": 5.229337503218008e-06, "logits/chosen": -1.1691386699676514, "logits/rejected": -1.1691386699676514, "logps/chosen": -27.64438247680664, "logps/rejected": -27.64438247680664, "loss": 0.3735, "rewards/accuracies": 0.0, "rewards/chosen": 2.468842029571533, "rewards/margins": 0.0, "rewards/rejected": 2.468842029571533, "step": 4526 }, { "epoch": 1.0, "learning_rate": 5.22754704681189e-06, "logits/chosen": -1.2328168153762817, "logits/rejected": -1.2175294160842896, "logps/chosen": -70.93429565429688, "logps/rejected": -74.8565444946289, "loss": 2.7689, "rewards/accuracies": 0.0, "rewards/chosen": 3.6232306957244873, "rewards/margins": -3.204364061355591, "rewards/rejected": 6.827594757080078, "step": 4527 }, { "epoch": 1.0, "learning_rate": 5.225756561166521e-06, "logits/chosen": -1.7766448259353638, "logits/rejected": -1.675167202949524, "logps/chosen": -93.16206359863281, "logps/rejected": -121.29725646972656, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 10.91911792755127, "rewards/margins": 3.5931944847106934, "rewards/rejected": 7.325923442840576, "step": 4528 }, { "epoch": 1.0, "learning_rate": 5.2239660465119765e-06, "logits/chosen": -1.638716697692871, "logits/rejected": -1.650230050086975, "logps/chosen": -59.024749755859375, "logps/rejected": -64.69149780273438, "loss": 1.2036, "rewards/accuracies": 0.0, "rewards/chosen": 3.4698379039764404, "rewards/margins": -1.2704384326934814, "rewards/rejected": 4.740276336669922, "step": 4529 }, { "epoch": 1.0, "learning_rate": 5.222175503078334e-06, "logits/chosen": -1.7539938688278198, "logits/rejected": -1.7539938688278198, "logps/chosen": -64.15399169921875, "logps/rejected": -64.15399169921875, "loss": 0.3504, "rewards/accuracies": 0.0, "rewards/chosen": 5.774852275848389, "rewards/margins": 0.0, "rewards/rejected": 5.774852275848389, "step": 4530 }, { "epoch": 1.0, "learning_rate": 5.220384931095674e-06, "logits/chosen": -1.6238548755645752, "logits/rejected": -1.557750940322876, "logps/chosen": -102.01161193847656, "logps/rejected": -53.93338394165039, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 8.266512870788574, "rewards/margins": 3.0241503715515137, "rewards/rejected": 5.2423624992370605, "step": 4531 }, { "epoch": 1.0, "learning_rate": 5.218594330794079e-06, "logits/chosen": -1.6815320253372192, "logits/rejected": -1.647128939628601, "logps/chosen": -54.7645263671875, "logps/rejected": -38.757118225097656, "loss": 0.6041, "rewards/accuracies": 1.0, "rewards/chosen": 5.6010355949401855, "rewards/margins": 2.677173137664795, "rewards/rejected": 2.9238624572753906, "step": 4532 }, { "epoch": 1.0, "learning_rate": 5.21680370240364e-06, "logits/chosen": -1.9232419729232788, "logits/rejected": -1.954194188117981, "logps/chosen": -205.82916259765625, "logps/rejected": -65.53544616699219, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 7.756024360656738, "rewards/margins": 3.7297563552856445, "rewards/rejected": 4.026268005371094, "step": 4533 }, { "epoch": 1.0, "learning_rate": 5.215013046154445e-06, "logits/chosen": -1.6176784038543701, "logits/rejected": -1.5980322360992432, "logps/chosen": -21.82866859436035, "logps/rejected": -66.56616973876953, "loss": 0.3139, "rewards/accuracies": 1.0, "rewards/chosen": 2.281938076019287, "rewards/margins": 0.22434139251708984, "rewards/rejected": 2.0575966835021973, "step": 4534 }, { "epoch": 1.0, "learning_rate": 5.21322236227659e-06, "logits/chosen": -1.7267777919769287, "logits/rejected": -1.7247166633605957, "logps/chosen": -102.12003326416016, "logps/rejected": -86.9815673828125, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 8.049729347229004, "rewards/margins": 4.813572883605957, "rewards/rejected": 3.236156463623047, "step": 4535 }, { "epoch": 1.0, "learning_rate": 5.211431651000174e-06, "logits/chosen": -1.6369836330413818, "logits/rejected": -1.561259150505066, "logps/chosen": -106.83695983886719, "logps/rejected": -76.14496612548828, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 6.891114711761475, "rewards/margins": 5.689952850341797, "rewards/rejected": 1.2011619806289673, "step": 4536 }, { "epoch": 1.0, "learning_rate": 5.209640912555301e-06, "logits/chosen": -2.0587236881256104, "logits/rejected": -1.9545506238937378, "logps/chosen": -120.41411590576172, "logps/rejected": -15.16743278503418, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 7.241458892822266, "rewards/margins": 4.449982643127441, "rewards/rejected": 2.791476249694824, "step": 4537 }, { "epoch": 1.0, "learning_rate": 5.207850147172073e-06, "logits/chosen": -1.6132125854492188, "logits/rejected": -1.6977146863937378, "logps/chosen": -53.0792350769043, "logps/rejected": -99.05299377441406, "loss": 0.2754, "rewards/accuracies": 1.0, "rewards/chosen": 6.978402614593506, "rewards/margins": 0.33957338333129883, "rewards/rejected": 6.638829231262207, "step": 4538 }, { "epoch": 1.0, "learning_rate": 5.206059355080601e-06, "logits/chosen": -2.0905306339263916, "logits/rejected": -2.1129422187805176, "logps/chosen": -65.31855010986328, "logps/rejected": -37.624900817871094, "loss": 0.3987, "rewards/accuracies": 1.0, "rewards/chosen": 3.753293752670288, "rewards/margins": 0.5131187438964844, "rewards/rejected": 3.2401750087738037, "step": 4539 }, { "epoch": 1.0, "learning_rate": 5.204268536510997e-06, "logits/chosen": -1.8310761451721191, "logits/rejected": -1.8211798667907715, "logps/chosen": -96.90928649902344, "logps/rejected": -85.44498443603516, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 6.248573303222656, "rewards/margins": 1.8420872688293457, "rewards/rejected": 4.4064860343933105, "step": 4540 }, { "epoch": 1.01, "learning_rate": 5.202477691693378e-06, "logits/chosen": -1.852866291999817, "logits/rejected": -1.852866291999817, "logps/chosen": -89.58425903320312, "logps/rejected": -89.58425903320312, "loss": 0.3496, "rewards/accuracies": 0.0, "rewards/chosen": 7.7687530517578125, "rewards/margins": 0.0, "rewards/rejected": 7.7687530517578125, "step": 4541 }, { "epoch": 1.01, "learning_rate": 5.200686820857862e-06, "logits/chosen": -1.6322922706604004, "logits/rejected": -1.5723670721054077, "logps/chosen": -116.60137939453125, "logps/rejected": -51.051307678222656, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 7.638635158538818, "rewards/margins": 4.2519049644470215, "rewards/rejected": 3.386730194091797, "step": 4542 }, { "epoch": 1.01, "learning_rate": 5.198895924234573e-06, "logits/chosen": -1.5301401615142822, "logits/rejected": -1.5396625995635986, "logps/chosen": -48.3359260559082, "logps/rejected": -65.65706634521484, "loss": 0.1984, "rewards/accuracies": 1.0, "rewards/chosen": 3.4052531719207764, "rewards/margins": 0.8692188262939453, "rewards/rejected": 2.536034345626831, "step": 4543 }, { "epoch": 1.01, "learning_rate": 5.197105002053634e-06, "logits/chosen": -1.582443118095398, "logits/rejected": -1.4836004972457886, "logps/chosen": -64.84059143066406, "logps/rejected": -57.638885498046875, "loss": 0.516, "rewards/accuracies": 0.0, "rewards/chosen": 6.46492338180542, "rewards/margins": -0.06754493713378906, "rewards/rejected": 6.532468318939209, "step": 4544 }, { "epoch": 1.01, "learning_rate": 5.195314054545176e-06, "logits/chosen": -1.6006412506103516, "logits/rejected": -1.5928102731704712, "logps/chosen": -48.68269729614258, "logps/rejected": -101.39103698730469, "loss": 1.0112, "rewards/accuracies": 1.0, "rewards/chosen": 6.395480632781982, "rewards/margins": 0.03552722930908203, "rewards/rejected": 6.3599534034729, "step": 4545 }, { "epoch": 1.01, "learning_rate": 5.193523081939332e-06, "logits/chosen": -1.7209726572036743, "logits/rejected": -1.6704379320144653, "logps/chosen": -90.82135009765625, "logps/rejected": -106.25711059570312, "loss": 0.2662, "rewards/accuracies": 1.0, "rewards/chosen": 5.192099094390869, "rewards/margins": 1.4702668190002441, "rewards/rejected": 3.721832275390625, "step": 4546 }, { "epoch": 1.01, "learning_rate": 5.191732084466239e-06, "logits/chosen": -1.531522512435913, "logits/rejected": -1.531522512435913, "logps/chosen": -19.283212661743164, "logps/rejected": -19.283212661743164, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 2.4935431480407715, "rewards/margins": 0.0, "rewards/rejected": 2.4935431480407715, "step": 4547 }, { "epoch": 1.01, "learning_rate": 5.189941062356032e-06, "logits/chosen": -1.7156752347946167, "logits/rejected": -1.6198745965957642, "logps/chosen": -47.75090026855469, "logps/rejected": -18.824459075927734, "loss": 0.2551, "rewards/accuracies": 1.0, "rewards/chosen": 4.972201824188232, "rewards/margins": 3.1592867374420166, "rewards/rejected": 1.8129150867462158, "step": 4548 }, { "epoch": 1.01, "learning_rate": 5.188150015838855e-06, "logits/chosen": -2.0781161785125732, "logits/rejected": -2.005357265472412, "logps/chosen": -100.01766204833984, "logps/rejected": -92.45266723632812, "loss": 0.1335, "rewards/accuracies": 1.0, "rewards/chosen": 7.4099249839782715, "rewards/margins": 1.822110652923584, "rewards/rejected": 5.5878143310546875, "step": 4549 }, { "epoch": 1.01, "learning_rate": 5.186358945144855e-06, "logits/chosen": -1.8034777641296387, "logits/rejected": -1.6826404333114624, "logps/chosen": -93.06997680664062, "logps/rejected": -40.07909393310547, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 8.224029541015625, "rewards/margins": 5.399903297424316, "rewards/rejected": 2.8241264820098877, "step": 4550 }, { "epoch": 1.01, "learning_rate": 5.184567850504177e-06, "logits/chosen": -1.3355305194854736, "logits/rejected": -1.2173353433609009, "logps/chosen": -24.09016990661621, "logps/rejected": -3.974498987197876, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 2.261549949645996, "rewards/margins": 1.5550860166549683, "rewards/rejected": 0.7064639329910278, "step": 4551 }, { "epoch": 1.01, "learning_rate": 5.182776732146974e-06, "logits/chosen": -1.7230201959609985, "logits/rejected": -1.7031879425048828, "logps/chosen": -33.775245666503906, "logps/rejected": -65.83782958984375, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 3.1649818420410156, "rewards/margins": 1.440179467201233, "rewards/rejected": 1.7248023748397827, "step": 4552 }, { "epoch": 1.01, "learning_rate": 5.180985590303401e-06, "logits/chosen": -1.5966215133666992, "logits/rejected": -1.5007668733596802, "logps/chosen": -48.9912109375, "logps/rejected": -6.9627766609191895, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 6.07512903213501, "rewards/margins": 5.254552364349365, "rewards/rejected": 0.8205768465995789, "step": 4553 }, { "epoch": 1.01, "learning_rate": 5.179194425203614e-06, "logits/chosen": -1.8015451431274414, "logits/rejected": -1.765425205230713, "logps/chosen": -73.83843231201172, "logps/rejected": -70.2214126586914, "loss": 0.3428, "rewards/accuracies": 1.0, "rewards/chosen": 5.387662410736084, "rewards/margins": 0.03658103942871094, "rewards/rejected": 5.351081371307373, "step": 4554 }, { "epoch": 1.01, "learning_rate": 5.177403237077775e-06, "logits/chosen": -1.6900476217269897, "logits/rejected": -1.4365599155426025, "logps/chosen": -63.104286193847656, "logps/rejected": -122.08880615234375, "loss": 1.4666, "rewards/accuracies": 0.0, "rewards/chosen": 3.1667802333831787, "rewards/margins": -2.8422162532806396, "rewards/rejected": 6.008996486663818, "step": 4555 }, { "epoch": 1.01, "learning_rate": 5.175612026156045e-06, "logits/chosen": -1.875251293182373, "logits/rejected": -1.8183761835098267, "logps/chosen": -99.95375061035156, "logps/rejected": -53.34303665161133, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 5.749151706695557, "rewards/margins": 2.3732190132141113, "rewards/rejected": 3.3759326934814453, "step": 4556 }, { "epoch": 1.01, "learning_rate": 5.173820792668593e-06, "logits/chosen": -1.585783839225769, "logits/rejected": -1.592871069908142, "logps/chosen": -62.696773529052734, "logps/rejected": -39.74285888671875, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 4.346344470977783, "rewards/margins": 0.5779194831848145, "rewards/rejected": 3.7684249877929688, "step": 4557 }, { "epoch": 1.01, "learning_rate": 5.172029536845587e-06, "logits/chosen": -1.5364247560501099, "logits/rejected": -1.5524955987930298, "logps/chosen": -39.730995178222656, "logps/rejected": -82.84980010986328, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": 4.626847267150879, "rewards/margins": 0.2903289794921875, "rewards/rejected": 4.336518287658691, "step": 4558 }, { "epoch": 1.01, "learning_rate": 5.170238258917201e-06, "logits/chosen": -1.6960484981536865, "logits/rejected": -1.5636818408966064, "logps/chosen": -121.04879760742188, "logps/rejected": -48.08365249633789, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 6.673800945281982, "rewards/margins": 4.420645713806152, "rewards/rejected": 2.25315523147583, "step": 4559 }, { "epoch": 1.01, "learning_rate": 5.168446959113604e-06, "logits/chosen": -2.0001041889190674, "logits/rejected": -1.9312204122543335, "logps/chosen": -125.9002914428711, "logps/rejected": -37.0838623046875, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 6.2275004386901855, "rewards/margins": 4.028923034667969, "rewards/rejected": 2.198577642440796, "step": 4560 }, { "epoch": 1.01, "learning_rate": 5.16665563766498e-06, "logits/chosen": -1.9978065490722656, "logits/rejected": -1.9420186281204224, "logps/chosen": -58.93883514404297, "logps/rejected": -62.96845245361328, "loss": 0.1932, "rewards/accuracies": 1.0, "rewards/chosen": 3.4762017726898193, "rewards/margins": 1.7659424543380737, "rewards/rejected": 1.7102593183517456, "step": 4561 }, { "epoch": 1.01, "learning_rate": 5.164864294801507e-06, "logits/chosen": -1.9097503423690796, "logits/rejected": -1.7598780393600464, "logps/chosen": -149.68798828125, "logps/rejected": -43.83750915527344, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 6.482632637023926, "rewards/margins": 3.2108757495880127, "rewards/rejected": 3.271756887435913, "step": 4562 }, { "epoch": 1.01, "learning_rate": 5.163072930753368e-06, "logits/chosen": -1.7599283456802368, "logits/rejected": -1.6193996667861938, "logps/chosen": -78.81649780273438, "logps/rejected": -26.95113182067871, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": 5.035643100738525, "rewards/margins": 4.632808685302734, "rewards/rejected": 0.4028345048427582, "step": 4563 }, { "epoch": 1.01, "learning_rate": 5.161281545750749e-06, "logits/chosen": -1.577139139175415, "logits/rejected": -1.5657501220703125, "logps/chosen": -52.611351013183594, "logps/rejected": -51.27629089355469, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": 3.5274055004119873, "rewards/margins": 0.8007285594940186, "rewards/rejected": 2.7266769409179688, "step": 4564 }, { "epoch": 1.01, "learning_rate": 5.1594901400238394e-06, "logits/chosen": -1.713981032371521, "logits/rejected": -1.7180544137954712, "logps/chosen": -88.10677337646484, "logps/rejected": -146.54876708984375, "loss": 0.5986, "rewards/accuracies": 0.0, "rewards/chosen": 8.995566368103027, "rewards/margins": -0.7171659469604492, "rewards/rejected": 9.712732315063477, "step": 4565 }, { "epoch": 1.01, "learning_rate": 5.157698713802831e-06, "logits/chosen": -1.425286054611206, "logits/rejected": -1.2877758741378784, "logps/chosen": -55.204288482666016, "logps/rejected": -37.362579345703125, "loss": 0.1691, "rewards/accuracies": 1.0, "rewards/chosen": 5.46887731552124, "rewards/margins": 3.0063273906707764, "rewards/rejected": 2.462549924850464, "step": 4566 }, { "epoch": 1.01, "learning_rate": 5.155907267317916e-06, "logits/chosen": -1.8851655721664429, "logits/rejected": -1.6980621814727783, "logps/chosen": -143.1493682861328, "logps/rejected": -106.60363006591797, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 7.146790981292725, "rewards/margins": 4.546387672424316, "rewards/rejected": 2.6004035472869873, "step": 4567 }, { "epoch": 1.01, "learning_rate": 5.154115800799294e-06, "logits/chosen": -1.8450236320495605, "logits/rejected": -1.7418246269226074, "logps/chosen": -95.66539001464844, "logps/rejected": -87.5543441772461, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": 6.949528694152832, "rewards/margins": 2.6481680870056152, "rewards/rejected": 4.301360607147217, "step": 4568 }, { "epoch": 1.01, "learning_rate": 5.15232431447716e-06, "logits/chosen": -1.682166576385498, "logits/rejected": -1.722703456878662, "logps/chosen": -31.52885627746582, "logps/rejected": -39.87145233154297, "loss": 2.7504, "rewards/accuracies": 0.0, "rewards/chosen": 2.609825372695923, "rewards/margins": -4.41005802154541, "rewards/rejected": 7.019883632659912, "step": 4569 }, { "epoch": 1.01, "learning_rate": 5.150532808581718e-06, "logits/chosen": -1.3968790769577026, "logits/rejected": -1.3663547039031982, "logps/chosen": -52.29859924316406, "logps/rejected": -105.87908935546875, "loss": 0.1444, "rewards/accuracies": 1.0, "rewards/chosen": 5.552992343902588, "rewards/margins": 1.1163344383239746, "rewards/rejected": 4.436657905578613, "step": 4570 }, { "epoch": 1.01, "learning_rate": 5.1487412833431715e-06, "logits/chosen": -1.8956141471862793, "logits/rejected": -1.8119415044784546, "logps/chosen": -86.4576416015625, "logps/rejected": -61.07954406738281, "loss": 0.0749, "rewards/accuracies": 1.0, "rewards/chosen": 5.892724514007568, "rewards/margins": 2.0943517684936523, "rewards/rejected": 3.798372745513916, "step": 4571 }, { "epoch": 1.01, "learning_rate": 5.146949738991729e-06, "logits/chosen": -2.0262296199798584, "logits/rejected": -2.01340389251709, "logps/chosen": -107.55982971191406, "logps/rejected": -48.5882568359375, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 6.484452724456787, "rewards/margins": 2.5085737705230713, "rewards/rejected": 3.975878953933716, "step": 4572 }, { "epoch": 1.01, "learning_rate": 5.145158175757597e-06, "logits/chosen": -1.5522575378417969, "logits/rejected": -1.4460567235946655, "logps/chosen": -51.811302185058594, "logps/rejected": -48.32012939453125, "loss": 0.238, "rewards/accuracies": 1.0, "rewards/chosen": 3.6881492137908936, "rewards/margins": 1.5996694564819336, "rewards/rejected": 2.08847975730896, "step": 4573 }, { "epoch": 1.01, "learning_rate": 5.14336659387099e-06, "logits/chosen": -1.9374029636383057, "logits/rejected": -1.9101147651672363, "logps/chosen": -55.50360107421875, "logps/rejected": -77.48423767089844, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": 3.1312692165374756, "rewards/margins": 0.6132669448852539, "rewards/rejected": 2.5180022716522217, "step": 4574 }, { "epoch": 1.01, "learning_rate": 5.141574993562119e-06, "logits/chosen": -1.55579674243927, "logits/rejected": -1.56992506980896, "logps/chosen": -28.538990020751953, "logps/rejected": -31.280595779418945, "loss": 0.8286, "rewards/accuracies": 0.0, "rewards/chosen": 3.5620434284210205, "rewards/margins": -0.26001691818237305, "rewards/rejected": 3.8220603466033936, "step": 4575 }, { "epoch": 1.01, "learning_rate": 5.139783375061201e-06, "logits/chosen": -2.1562607288360596, "logits/rejected": -2.1650214195251465, "logps/chosen": -63.900917053222656, "logps/rejected": -25.21111488342285, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 6.930721282958984, "rewards/margins": 3.973351001739502, "rewards/rejected": 2.9573702812194824, "step": 4576 }, { "epoch": 1.01, "learning_rate": 5.137991738598457e-06, "logits/chosen": -1.5522652864456177, "logits/rejected": -1.6975089311599731, "logps/chosen": -65.07266235351562, "logps/rejected": -175.74301147460938, "loss": 1.7593, "rewards/accuracies": 0.0, "rewards/chosen": 4.791326999664307, "rewards/margins": -3.461556911468506, "rewards/rejected": 8.252883911132812, "step": 4577 }, { "epoch": 1.01, "learning_rate": 5.1362000844041075e-06, "logits/chosen": -1.5533804893493652, "logits/rejected": -1.5849930047988892, "logps/chosen": -13.034911155700684, "logps/rejected": -44.780181884765625, "loss": 1.3446, "rewards/accuracies": 0.0, "rewards/chosen": 1.2297037839889526, "rewards/margins": -1.5037175416946411, "rewards/rejected": 2.7334213256835938, "step": 4578 }, { "epoch": 1.01, "learning_rate": 5.134408412708372e-06, "logits/chosen": -1.6095316410064697, "logits/rejected": -1.462897539138794, "logps/chosen": -56.14771270751953, "logps/rejected": -47.113365173339844, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": 5.189162731170654, "rewards/margins": 2.4041597843170166, "rewards/rejected": 2.7850029468536377, "step": 4579 }, { "epoch": 1.01, "learning_rate": 5.132616723741478e-06, "logits/chosen": -1.7774367332458496, "logits/rejected": -1.592012643814087, "logps/chosen": -57.172950744628906, "logps/rejected": -12.075721740722656, "loss": 0.0612, "rewards/accuracies": 1.0, "rewards/chosen": 3.4763572216033936, "rewards/margins": 2.5246355533599854, "rewards/rejected": 0.9517216086387634, "step": 4580 }, { "epoch": 1.01, "learning_rate": 5.130825017733656e-06, "logits/chosen": -1.7536484003067017, "logits/rejected": -1.557428240776062, "logps/chosen": -118.30684661865234, "logps/rejected": -28.536161422729492, "loss": 0.3955, "rewards/accuracies": 1.0, "rewards/chosen": 5.568360328674316, "rewards/margins": 3.2333121299743652, "rewards/rejected": 2.335048198699951, "step": 4581 }, { "epoch": 1.01, "learning_rate": 5.129033294915132e-06, "logits/chosen": -1.8200267553329468, "logits/rejected": -1.795601725578308, "logps/chosen": -65.60132598876953, "logps/rejected": -87.44526672363281, "loss": 0.4647, "rewards/accuracies": 1.0, "rewards/chosen": 4.0376458168029785, "rewards/margins": 1.3220405578613281, "rewards/rejected": 2.7156052589416504, "step": 4582 }, { "epoch": 1.01, "learning_rate": 5.127241555516141e-06, "logits/chosen": -1.846781849861145, "logits/rejected": -1.8952033519744873, "logps/chosen": -34.88216018676758, "logps/rejected": -123.8864974975586, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 3.1687495708465576, "rewards/margins": 1.9516329765319824, "rewards/rejected": 1.2171165943145752, "step": 4583 }, { "epoch": 1.01, "learning_rate": 5.125449799766916e-06, "logits/chosen": -1.7702014446258545, "logits/rejected": -1.7078156471252441, "logps/chosen": -179.45628356933594, "logps/rejected": -95.01437377929688, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/chosen": 8.548079490661621, "rewards/margins": 2.010798931121826, "rewards/rejected": 6.537280559539795, "step": 4584 }, { "epoch": 1.01, "learning_rate": 5.123658027897692e-06, "logits/chosen": -1.8437488079071045, "logits/rejected": -1.920204997062683, "logps/chosen": -117.8251724243164, "logps/rejected": -229.1414794921875, "loss": 0.1816, "rewards/accuracies": 1.0, "rewards/chosen": 13.104486465454102, "rewards/margins": 0.8506536483764648, "rewards/rejected": 12.253832817077637, "step": 4585 }, { "epoch": 1.02, "learning_rate": 5.12186624013871e-06, "logits/chosen": -2.0283823013305664, "logits/rejected": -1.9713670015335083, "logps/chosen": -82.47398376464844, "logps/rejected": -106.84259033203125, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 8.691558837890625, "rewards/margins": 2.979292392730713, "rewards/rejected": 5.712266445159912, "step": 4586 }, { "epoch": 1.02, "learning_rate": 5.120074436720208e-06, "logits/chosen": -1.5509659051895142, "logits/rejected": -1.503398060798645, "logps/chosen": -49.29216766357422, "logps/rejected": -35.352394104003906, "loss": 1.0169, "rewards/accuracies": 1.0, "rewards/chosen": 3.4321205615997314, "rewards/margins": 0.6558067798614502, "rewards/rejected": 2.7763137817382812, "step": 4587 }, { "epoch": 1.02, "learning_rate": 5.118282617872432e-06, "logits/chosen": -1.8510982990264893, "logits/rejected": -1.6808301210403442, "logps/chosen": -119.37907409667969, "logps/rejected": -12.754932403564453, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 5.964573860168457, "rewards/margins": 4.469136714935303, "rewards/rejected": 1.4954372644424438, "step": 4588 }, { "epoch": 1.02, "learning_rate": 5.116490783825622e-06, "logits/chosen": -1.7247146368026733, "logits/rejected": -1.6651809215545654, "logps/chosen": -51.217411041259766, "logps/rejected": -44.2320442199707, "loss": 0.2906, "rewards/accuracies": 1.0, "rewards/chosen": 3.554028034210205, "rewards/margins": 0.44875335693359375, "rewards/rejected": 3.1052746772766113, "step": 4589 }, { "epoch": 1.02, "learning_rate": 5.114698934810026e-06, "logits/chosen": -1.9708545207977295, "logits/rejected": -1.8837894201278687, "logps/chosen": -55.34779357910156, "logps/rejected": -53.374046325683594, "loss": 0.1098, "rewards/accuracies": 1.0, "rewards/chosen": 3.9424896240234375, "rewards/margins": 2.7433524131774902, "rewards/rejected": 1.1991370916366577, "step": 4590 }, { "epoch": 1.02, "learning_rate": 5.112907071055893e-06, "logits/chosen": -1.7201249599456787, "logits/rejected": -1.6133822202682495, "logps/chosen": -67.55299377441406, "logps/rejected": -23.60696029663086, "loss": 0.2496, "rewards/accuracies": 1.0, "rewards/chosen": 2.208477735519409, "rewards/margins": 1.1768580675125122, "rewards/rejected": 1.031619668006897, "step": 4591 }, { "epoch": 1.02, "learning_rate": 5.1111151927934735e-06, "logits/chosen": -1.8526465892791748, "logits/rejected": -1.901145577430725, "logps/chosen": -63.44744873046875, "logps/rejected": -112.86793518066406, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 7.551797389984131, "rewards/margins": 2.0750365257263184, "rewards/rejected": 5.4767608642578125, "step": 4592 }, { "epoch": 1.02, "learning_rate": 5.109323300253019e-06, "logits/chosen": -1.5469369888305664, "logits/rejected": -1.4855128526687622, "logps/chosen": -142.76528930664062, "logps/rejected": -46.69782257080078, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 11.536419868469238, "rewards/margins": 4.0052924156188965, "rewards/rejected": 7.531127452850342, "step": 4593 }, { "epoch": 1.02, "learning_rate": 5.107531393664781e-06, "logits/chosen": -1.5044068098068237, "logits/rejected": -1.470340609550476, "logps/chosen": -71.88646697998047, "logps/rejected": -71.01243591308594, "loss": 0.5218, "rewards/accuracies": 0.0, "rewards/chosen": 4.649878978729248, "rewards/margins": -0.4785733222961426, "rewards/rejected": 5.128452301025391, "step": 4594 }, { "epoch": 1.02, "learning_rate": 5.105739473259019e-06, "logits/chosen": -1.5600281953811646, "logits/rejected": -1.45327889919281, "logps/chosen": -86.41375732421875, "logps/rejected": -55.85749816894531, "loss": 2.1683, "rewards/accuracies": 1.0, "rewards/chosen": 3.7773849964141846, "rewards/margins": 3.0681099891662598, "rewards/rejected": 0.7092750668525696, "step": 4595 }, { "epoch": 1.02, "learning_rate": 5.103947539265989e-06, "logits/chosen": -1.5817556381225586, "logits/rejected": -1.5441337823867798, "logps/chosen": -27.808780670166016, "logps/rejected": -54.857887268066406, "loss": 0.5594, "rewards/accuracies": 1.0, "rewards/chosen": 2.0654308795928955, "rewards/margins": 0.271572470664978, "rewards/rejected": 1.7938584089279175, "step": 4596 }, { "epoch": 1.02, "learning_rate": 5.10215559191595e-06, "logits/chosen": -1.680320143699646, "logits/rejected": -1.3295758962631226, "logps/chosen": -105.94371032714844, "logps/rejected": -88.46270751953125, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 8.864031791687012, "rewards/margins": 3.5605344772338867, "rewards/rejected": 5.303497314453125, "step": 4597 }, { "epoch": 1.02, "learning_rate": 5.100363631439162e-06, "logits/chosen": -1.9077081680297852, "logits/rejected": -1.817514419555664, "logps/chosen": -66.22103881835938, "logps/rejected": -83.28919219970703, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 8.744180679321289, "rewards/margins": 2.0208382606506348, "rewards/rejected": 6.723342418670654, "step": 4598 }, { "epoch": 1.02, "learning_rate": 5.098571658065889e-06, "logits/chosen": -1.8254623413085938, "logits/rejected": -1.8237240314483643, "logps/chosen": -70.83293914794922, "logps/rejected": -48.144290924072266, "loss": 0.2931, "rewards/accuracies": 1.0, "rewards/chosen": 4.795657634735107, "rewards/margins": 0.3687605857849121, "rewards/rejected": 4.426897048950195, "step": 4599 }, { "epoch": 1.02, "learning_rate": 5.096779672026397e-06, "logits/chosen": -1.5815294981002808, "logits/rejected": -1.5717394351959229, "logps/chosen": -34.33399963378906, "logps/rejected": -27.93527603149414, "loss": 0.2607, "rewards/accuracies": 1.0, "rewards/chosen": 3.814748525619507, "rewards/margins": 1.1423313617706299, "rewards/rejected": 2.672417163848877, "step": 4600 }, { "epoch": 1.02, "learning_rate": 5.094987673550948e-06, "logits/chosen": -1.7583829164505005, "logits/rejected": -1.769717812538147, "logps/chosen": -60.044193267822266, "logps/rejected": -66.38678741455078, "loss": 0.5168, "rewards/accuracies": 0.0, "rewards/chosen": 4.181047439575195, "rewards/margins": -0.5889501571655273, "rewards/rejected": 4.769997596740723, "step": 4601 }, { "epoch": 1.02, "learning_rate": 5.093195662869811e-06, "logits/chosen": -1.7958630323410034, "logits/rejected": -1.785123348236084, "logps/chosen": -28.159269332885742, "logps/rejected": -62.16814422607422, "loss": 1.445, "rewards/accuracies": 0.0, "rewards/chosen": 1.8484888076782227, "rewards/margins": -0.8929517269134521, "rewards/rejected": 2.741440534591675, "step": 4602 }, { "epoch": 1.02, "learning_rate": 5.091403640213256e-06, "logits/chosen": -1.6840901374816895, "logits/rejected": -1.7053886651992798, "logps/chosen": -56.71357727050781, "logps/rejected": -48.72089767456055, "loss": 0.2341, "rewards/accuracies": 1.0, "rewards/chosen": 3.645916700363159, "rewards/margins": 0.8007717132568359, "rewards/rejected": 2.8451449871063232, "step": 4603 }, { "epoch": 1.02, "learning_rate": 5.089611605811552e-06, "logits/chosen": -1.7491651773452759, "logits/rejected": -1.6199278831481934, "logps/chosen": -65.44009399414062, "logps/rejected": -8.97143268585205, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": 2.775343418121338, "rewards/margins": 2.204787254333496, "rewards/rejected": 0.570556104183197, "step": 4604 }, { "epoch": 1.02, "learning_rate": 5.0878195598949735e-06, "logits/chosen": -1.6497461795806885, "logits/rejected": -1.6497461795806885, "logps/chosen": -51.95454406738281, "logps/rejected": -51.95454406738281, "loss": 0.4232, "rewards/accuracies": 0.0, "rewards/chosen": 4.978828430175781, "rewards/margins": 0.0, "rewards/rejected": 4.978828430175781, "step": 4605 }, { "epoch": 1.02, "learning_rate": 5.086027502693791e-06, "logits/chosen": -1.741635799407959, "logits/rejected": -1.71406090259552, "logps/chosen": -78.3889389038086, "logps/rejected": -78.87158203125, "loss": 0.7326, "rewards/accuracies": 1.0, "rewards/chosen": 5.271737098693848, "rewards/margins": 1.9876000881195068, "rewards/rejected": 3.284137010574341, "step": 4606 }, { "epoch": 1.02, "learning_rate": 5.084235434438283e-06, "logits/chosen": -1.6196562051773071, "logits/rejected": -1.5133132934570312, "logps/chosen": -39.85689926147461, "logps/rejected": -129.44613647460938, "loss": 0.5262, "rewards/accuracies": 0.0, "rewards/chosen": 4.744058609008789, "rewards/margins": -0.40539073944091797, "rewards/rejected": 5.149449348449707, "step": 4607 }, { "epoch": 1.02, "learning_rate": 5.082443355358724e-06, "logits/chosen": -2.0050456523895264, "logits/rejected": -1.9458434581756592, "logps/chosen": -82.42251586914062, "logps/rejected": -31.228248596191406, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 7.4595184326171875, "rewards/margins": 4.191259384155273, "rewards/rejected": 3.268258810043335, "step": 4608 }, { "epoch": 1.02, "learning_rate": 5.080651265685392e-06, "logits/chosen": -1.680793046951294, "logits/rejected": -1.6180669069290161, "logps/chosen": -48.549835205078125, "logps/rejected": -172.66432189941406, "loss": 2.9996, "rewards/accuracies": 0.0, "rewards/chosen": 4.329376220703125, "rewards/margins": -5.2913713455200195, "rewards/rejected": 9.620747566223145, "step": 4609 }, { "epoch": 1.02, "learning_rate": 5.078859165648565e-06, "logits/chosen": -1.724530816078186, "logits/rejected": -1.7601457834243774, "logps/chosen": -57.5039176940918, "logps/rejected": -100.8487548828125, "loss": 0.8436, "rewards/accuracies": 0.0, "rewards/chosen": 5.287655353546143, "rewards/margins": -1.4792513847351074, "rewards/rejected": 6.76690673828125, "step": 4610 }, { "epoch": 1.02, "learning_rate": 5.077067055478525e-06, "logits/chosen": -1.775942087173462, "logits/rejected": -1.7693313360214233, "logps/chosen": -43.822052001953125, "logps/rejected": -86.47903442382812, "loss": 0.3761, "rewards/accuracies": 1.0, "rewards/chosen": 3.952256917953491, "rewards/margins": 2.8010880947113037, "rewards/rejected": 1.1511688232421875, "step": 4611 }, { "epoch": 1.02, "learning_rate": 5.075274935405554e-06, "logits/chosen": -1.6957701444625854, "logits/rejected": -1.6103476285934448, "logps/chosen": -53.903526306152344, "logps/rejected": -55.82294845581055, "loss": 0.2774, "rewards/accuracies": 1.0, "rewards/chosen": 4.830426216125488, "rewards/margins": 1.0691251754760742, "rewards/rejected": 3.761301040649414, "step": 4612 }, { "epoch": 1.02, "learning_rate": 5.073482805659935e-06, "logits/chosen": -1.7142131328582764, "logits/rejected": -1.5544886589050293, "logps/chosen": -100.91133117675781, "logps/rejected": -77.05269622802734, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 5.696669101715088, "rewards/margins": 3.5674431324005127, "rewards/rejected": 2.129225969314575, "step": 4613 }, { "epoch": 1.02, "learning_rate": 5.071690666471951e-06, "logits/chosen": -1.8846532106399536, "logits/rejected": -1.8711029291152954, "logps/chosen": -99.84931945800781, "logps/rejected": -119.89775848388672, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": 8.860574722290039, "rewards/margins": 1.9682307243347168, "rewards/rejected": 6.892343997955322, "step": 4614 }, { "epoch": 1.02, "learning_rate": 5.06989851807189e-06, "logits/chosen": -1.9439712762832642, "logits/rejected": -1.9633252620697021, "logps/chosen": -49.01080322265625, "logps/rejected": -55.09950637817383, "loss": 0.087, "rewards/accuracies": 1.0, "rewards/chosen": 4.761336803436279, "rewards/margins": 1.760608434677124, "rewards/rejected": 3.0007283687591553, "step": 4615 }, { "epoch": 1.02, "learning_rate": 5.068106360690038e-06, "logits/chosen": -1.5145511627197266, "logits/rejected": -1.5169661045074463, "logps/chosen": -54.55019760131836, "logps/rejected": -73.1633071899414, "loss": 0.3538, "rewards/accuracies": 0.0, "rewards/chosen": 3.654515504837036, "rewards/margins": -0.022376537322998047, "rewards/rejected": 3.676892042160034, "step": 4616 }, { "epoch": 1.02, "learning_rate": 5.066314194556682e-06, "logits/chosen": -1.9260947704315186, "logits/rejected": -1.8328518867492676, "logps/chosen": -199.56434631347656, "logps/rejected": -58.45499801635742, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 7.929271221160889, "rewards/margins": 5.544328212738037, "rewards/rejected": 2.3849430084228516, "step": 4617 }, { "epoch": 1.02, "learning_rate": 5.06452201990211e-06, "logits/chosen": -1.8474570512771606, "logits/rejected": -1.922416090965271, "logps/chosen": -22.67767333984375, "logps/rejected": -133.99118041992188, "loss": 0.6666, "rewards/accuracies": 0.0, "rewards/chosen": 4.203822612762451, "rewards/margins": -0.9488859176635742, "rewards/rejected": 5.152708530426025, "step": 4618 }, { "epoch": 1.02, "learning_rate": 5.062729836956616e-06, "logits/chosen": -1.9355050325393677, "logits/rejected": -1.909730315208435, "logps/chosen": -94.50717163085938, "logps/rejected": -127.6611328125, "loss": 0.0596, "rewards/accuracies": 1.0, "rewards/chosen": 10.998178482055664, "rewards/margins": 2.0734376907348633, "rewards/rejected": 8.9247407913208, "step": 4619 }, { "epoch": 1.02, "learning_rate": 5.060937645950486e-06, "logits/chosen": -1.692095160484314, "logits/rejected": -1.5946650505065918, "logps/chosen": -20.973318099975586, "logps/rejected": -17.032373428344727, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 3.4035720825195312, "rewards/margins": 1.9141004085540771, "rewards/rejected": 1.489471673965454, "step": 4620 }, { "epoch": 1.02, "learning_rate": 5.059145447114016e-06, "logits/chosen": -1.808753252029419, "logits/rejected": -1.796431064605713, "logps/chosen": -97.83187103271484, "logps/rejected": -118.18720245361328, "loss": 0.388, "rewards/accuracies": 1.0, "rewards/chosen": 7.511506080627441, "rewards/margins": 0.02685546875, "rewards/rejected": 7.484650611877441, "step": 4621 }, { "epoch": 1.02, "learning_rate": 5.057353240677498e-06, "logits/chosen": -1.584882378578186, "logits/rejected": -1.584882378578186, "logps/chosen": -32.499298095703125, "logps/rejected": -32.499298095703125, "loss": 0.4045, "rewards/accuracies": 0.0, "rewards/chosen": 5.70888090133667, "rewards/margins": 0.0, "rewards/rejected": 5.70888090133667, "step": 4622 }, { "epoch": 1.02, "learning_rate": 5.055561026871228e-06, "logits/chosen": -1.845815896987915, "logits/rejected": -1.754435658454895, "logps/chosen": -82.24261474609375, "logps/rejected": -97.40099334716797, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 8.3294038772583, "rewards/margins": 3.6175055503845215, "rewards/rejected": 4.711898326873779, "step": 4623 }, { "epoch": 1.02, "learning_rate": 5.053768805925498e-06, "logits/chosen": -1.789829134941101, "logits/rejected": -1.6154308319091797, "logps/chosen": -96.49461364746094, "logps/rejected": -46.81117630004883, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 6.554532051086426, "rewards/margins": 4.439629077911377, "rewards/rejected": 2.114902973175049, "step": 4624 }, { "epoch": 1.02, "learning_rate": 5.051976578070607e-06, "logits/chosen": -1.8448063135147095, "logits/rejected": -1.7059271335601807, "logps/chosen": -111.36433410644531, "logps/rejected": -31.84247589111328, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 6.604045391082764, "rewards/margins": 2.615648031234741, "rewards/rejected": 3.9883973598480225, "step": 4625 }, { "epoch": 1.02, "learning_rate": 5.0501843435368495e-06, "logits/chosen": -1.922118902206421, "logits/rejected": -1.922118902206421, "logps/chosen": -64.23680114746094, "logps/rejected": -64.23680114746094, "loss": 0.3504, "rewards/accuracies": 0.0, "rewards/chosen": 4.671860694885254, "rewards/margins": 0.0, "rewards/rejected": 4.671860694885254, "step": 4626 }, { "epoch": 1.02, "learning_rate": 5.048392102554525e-06, "logits/chosen": -1.8210588693618774, "logits/rejected": -1.8210588693618774, "logps/chosen": -49.20802307128906, "logps/rejected": -49.20802307128906, "loss": 0.4022, "rewards/accuracies": 0.0, "rewards/chosen": 6.348336219787598, "rewards/margins": 0.0, "rewards/rejected": 6.348336219787598, "step": 4627 }, { "epoch": 1.02, "learning_rate": 5.046599855353931e-06, "logits/chosen": -1.9274606704711914, "logits/rejected": -1.3499890565872192, "logps/chosen": -54.97340393066406, "logps/rejected": -194.48556518554688, "loss": 2.3016, "rewards/accuracies": 0.0, "rewards/chosen": 4.067076206207275, "rewards/margins": -4.497959613800049, "rewards/rejected": 8.565035820007324, "step": 4628 }, { "epoch": 1.02, "learning_rate": 5.044807602165368e-06, "logits/chosen": -1.6815403699874878, "logits/rejected": -1.6536188125610352, "logps/chosen": -22.587886810302734, "logps/rejected": -52.160919189453125, "loss": 0.6109, "rewards/accuracies": 0.0, "rewards/chosen": 3.4557225704193115, "rewards/margins": -0.860276460647583, "rewards/rejected": 4.3159990310668945, "step": 4629 }, { "epoch": 1.02, "learning_rate": 5.043015343219137e-06, "logits/chosen": -1.86384916305542, "logits/rejected": -1.9775846004486084, "logps/chosen": -43.647361755371094, "logps/rejected": -107.06796264648438, "loss": 2.4323, "rewards/accuracies": 0.0, "rewards/chosen": 6.147336483001709, "rewards/margins": -4.855867862701416, "rewards/rejected": 11.003204345703125, "step": 4630 }, { "epoch": 1.03, "learning_rate": 5.041223078745535e-06, "logits/chosen": -1.8764526844024658, "logits/rejected": -1.8607392311096191, "logps/chosen": -52.22528839111328, "logps/rejected": -66.57624816894531, "loss": 0.7748, "rewards/accuracies": 0.0, "rewards/chosen": 3.0944221019744873, "rewards/margins": -1.2882516384124756, "rewards/rejected": 4.382673740386963, "step": 4631 }, { "epoch": 1.03, "learning_rate": 5.039430808974869e-06, "logits/chosen": -2.134352207183838, "logits/rejected": -2.1453330516815186, "logps/chosen": -22.741718292236328, "logps/rejected": -66.13379669189453, "loss": 0.4573, "rewards/accuracies": 0.0, "rewards/chosen": 3.199937105178833, "rewards/margins": -0.35471463203430176, "rewards/rejected": 3.5546517372131348, "step": 4632 }, { "epoch": 1.03, "learning_rate": 5.037638534137437e-06, "logits/chosen": -1.842206358909607, "logits/rejected": -1.842684268951416, "logps/chosen": -42.54328155517578, "logps/rejected": -73.94699096679688, "loss": 0.2374, "rewards/accuracies": 1.0, "rewards/chosen": 3.3278253078460693, "rewards/margins": 0.5138306617736816, "rewards/rejected": 2.8139946460723877, "step": 4633 }, { "epoch": 1.03, "learning_rate": 5.035846254463546e-06, "logits/chosen": -1.830693244934082, "logits/rejected": -1.7895556688308716, "logps/chosen": -68.54651641845703, "logps/rejected": -18.77663230895996, "loss": 0.3731, "rewards/accuracies": 1.0, "rewards/chosen": 3.3660666942596436, "rewards/margins": 2.908602237701416, "rewards/rejected": 0.45746442675590515, "step": 4634 }, { "epoch": 1.03, "learning_rate": 5.0340539701834965e-06, "logits/chosen": -1.9813261032104492, "logits/rejected": -1.9107377529144287, "logps/chosen": -88.30268859863281, "logps/rejected": -64.43124389648438, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 8.040234565734863, "rewards/margins": 4.042273044586182, "rewards/rejected": 3.9979615211486816, "step": 4635 }, { "epoch": 1.03, "learning_rate": 5.0322616815275926e-06, "logits/chosen": -2.109532356262207, "logits/rejected": -2.135591745376587, "logps/chosen": -67.99159240722656, "logps/rejected": -87.7286605834961, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 4.12715482711792, "rewards/margins": 0.3471949100494385, "rewards/rejected": 3.7799599170684814, "step": 4636 }, { "epoch": 1.03, "learning_rate": 5.030469388726142e-06, "logits/chosen": -1.920828104019165, "logits/rejected": -1.9377208948135376, "logps/chosen": -29.897254943847656, "logps/rejected": -89.34486389160156, "loss": 0.4003, "rewards/accuracies": 0.0, "rewards/chosen": 4.0825514793396, "rewards/margins": -0.16107511520385742, "rewards/rejected": 4.243626594543457, "step": 4637 }, { "epoch": 1.03, "learning_rate": 5.028677092009446e-06, "logits/chosen": -1.3209625482559204, "logits/rejected": -1.261698603630066, "logps/chosen": -19.167016983032227, "logps/rejected": -1.459100365638733, "loss": 0.5472, "rewards/accuracies": 1.0, "rewards/chosen": 1.367842674255371, "rewards/margins": 0.6328189969062805, "rewards/rejected": 0.7350236773490906, "step": 4638 }, { "epoch": 1.03, "learning_rate": 5.026884791607815e-06, "logits/chosen": -1.6644951105117798, "logits/rejected": -1.6557728052139282, "logps/chosen": -25.319673538208008, "logps/rejected": -59.597389221191406, "loss": 0.6363, "rewards/accuracies": 0.0, "rewards/chosen": 3.077029228210449, "rewards/margins": -0.8567607402801514, "rewards/rejected": 3.9337899684906006, "step": 4639 }, { "epoch": 1.03, "learning_rate": 5.025092487751552e-06, "logits/chosen": -2.030747175216675, "logits/rejected": -2.0831198692321777, "logps/chosen": -44.862396240234375, "logps/rejected": -97.16669464111328, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": 7.661979675292969, "rewards/margins": 1.462277889251709, "rewards/rejected": 6.19970178604126, "step": 4640 }, { "epoch": 1.03, "learning_rate": 5.0233001806709645e-06, "logits/chosen": -1.6608880758285522, "logits/rejected": -1.500573992729187, "logps/chosen": -41.627986907958984, "logps/rejected": -31.012775421142578, "loss": 0.4176, "rewards/accuracies": 1.0, "rewards/chosen": 2.589116334915161, "rewards/margins": 1.8790936470031738, "rewards/rejected": 0.7100227475166321, "step": 4641 }, { "epoch": 1.03, "learning_rate": 5.02150787059636e-06, "logits/chosen": -1.7307446002960205, "logits/rejected": -1.745893955230713, "logps/chosen": -37.614131927490234, "logps/rejected": -81.18724060058594, "loss": 0.5553, "rewards/accuracies": 0.0, "rewards/chosen": 2.2684712409973145, "rewards/margins": -0.36254382133483887, "rewards/rejected": 2.6310150623321533, "step": 4642 }, { "epoch": 1.03, "learning_rate": 5.019715557758046e-06, "logits/chosen": -1.9221867322921753, "logits/rejected": -1.875219464302063, "logps/chosen": -106.70893859863281, "logps/rejected": -63.64155578613281, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 6.199490547180176, "rewards/margins": 4.012267112731934, "rewards/rejected": 2.187223196029663, "step": 4643 }, { "epoch": 1.03, "learning_rate": 5.017923242386329e-06, "logits/chosen": -1.7398425340652466, "logits/rejected": -1.5588247776031494, "logps/chosen": -126.99703979492188, "logps/rejected": -25.78616714477539, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 8.318231582641602, "rewards/margins": 5.9977240562438965, "rewards/rejected": 2.320507526397705, "step": 4644 }, { "epoch": 1.03, "learning_rate": 5.01613092471152e-06, "logits/chosen": -1.8072410821914673, "logits/rejected": -1.8236523866653442, "logps/chosen": -39.43803787231445, "logps/rejected": -59.039005279541016, "loss": 0.8644, "rewards/accuracies": 0.0, "rewards/chosen": 2.988677740097046, "rewards/margins": -0.3909742832183838, "rewards/rejected": 3.3796520233154297, "step": 4645 }, { "epoch": 1.03, "learning_rate": 5.014338604963925e-06, "logits/chosen": -2.031399726867676, "logits/rejected": -2.0146048069000244, "logps/chosen": -46.839317321777344, "logps/rejected": -76.28895568847656, "loss": 0.2191, "rewards/accuracies": 1.0, "rewards/chosen": 7.239771366119385, "rewards/margins": 0.6214609146118164, "rewards/rejected": 6.618310451507568, "step": 4646 }, { "epoch": 1.03, "learning_rate": 5.012546283373853e-06, "logits/chosen": -1.8766543865203857, "logits/rejected": -1.8414973020553589, "logps/chosen": -51.768043518066406, "logps/rejected": -6.86464786529541, "loss": 0.3231, "rewards/accuracies": 1.0, "rewards/chosen": 3.252883195877075, "rewards/margins": 1.341339111328125, "rewards/rejected": 1.9115440845489502, "step": 4647 }, { "epoch": 1.03, "learning_rate": 5.010753960171615e-06, "logits/chosen": -1.9102743864059448, "logits/rejected": -1.8665140867233276, "logps/chosen": -107.99342346191406, "logps/rejected": -95.75274658203125, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": 7.32356595993042, "rewards/margins": 1.52199125289917, "rewards/rejected": 5.80157470703125, "step": 4648 }, { "epoch": 1.03, "learning_rate": 5.0089616355875195e-06, "logits/chosen": -1.5912636518478394, "logits/rejected": -1.593926191329956, "logps/chosen": -70.4586181640625, "logps/rejected": -82.44061279296875, "loss": 0.734, "rewards/accuracies": 0.0, "rewards/chosen": 7.2052001953125, "rewards/margins": -0.8422880172729492, "rewards/rejected": 8.04748821258545, "step": 4649 }, { "epoch": 1.03, "learning_rate": 5.007169309851873e-06, "logits/chosen": -1.731695532798767, "logits/rejected": -1.7351863384246826, "logps/chosen": -57.91170883178711, "logps/rejected": -49.11865997314453, "loss": 0.5426, "rewards/accuracies": 0.0, "rewards/chosen": 4.232451438903809, "rewards/margins": -0.6488232612609863, "rewards/rejected": 4.881274700164795, "step": 4650 }, { "epoch": 1.03, "learning_rate": 5.005376983194991e-06, "logits/chosen": -1.7482326030731201, "logits/rejected": -1.7158616781234741, "logps/chosen": -135.59439086914062, "logps/rejected": -168.73130798339844, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 6.159010410308838, "rewards/margins": 2.680738925933838, "rewards/rejected": 3.478271484375, "step": 4651 }, { "epoch": 1.03, "learning_rate": 5.003584655847177e-06, "logits/chosen": -1.6667007207870483, "logits/rejected": -1.6669621467590332, "logps/chosen": -25.61168670654297, "logps/rejected": -34.93426513671875, "loss": 1.2949, "rewards/accuracies": 0.0, "rewards/chosen": 2.96598744392395, "rewards/margins": -0.507103681564331, "rewards/rejected": 3.4730911254882812, "step": 4652 }, { "epoch": 1.03, "learning_rate": 5.0017923280387435e-06, "logits/chosen": -1.69264554977417, "logits/rejected": -1.8295273780822754, "logps/chosen": -30.65056610107422, "logps/rejected": -56.96965026855469, "loss": 2.42, "rewards/accuracies": 0.0, "rewards/chosen": 4.172068119049072, "rewards/margins": -4.574925899505615, "rewards/rejected": 8.746994018554688, "step": 4653 }, { "epoch": 1.03, "learning_rate": 5e-06, "logits/chosen": -1.5354652404785156, "logits/rejected": -1.592955231666565, "logps/chosen": -39.19244384765625, "logps/rejected": -97.65579986572266, "loss": 0.5791, "rewards/accuracies": 0.0, "rewards/chosen": 4.396157741546631, "rewards/margins": -0.423830509185791, "rewards/rejected": 4.819988250732422, "step": 4654 }, { "epoch": 1.03, "learning_rate": 4.998207671961259e-06, "logits/chosen": -1.901328206062317, "logits/rejected": -1.9405381679534912, "logps/chosen": -47.05616760253906, "logps/rejected": -109.1236801147461, "loss": 0.6713, "rewards/accuracies": 0.0, "rewards/chosen": 7.597666263580322, "rewards/margins": -1.0248808860778809, "rewards/rejected": 8.622547149658203, "step": 4655 }, { "epoch": 1.03, "learning_rate": 4.996415344152824e-06, "logits/chosen": -1.9538555145263672, "logits/rejected": -2.0282387733459473, "logps/chosen": -55.24815368652344, "logps/rejected": -38.088165283203125, "loss": 1.2497, "rewards/accuracies": 1.0, "rewards/chosen": 2.5347793102264404, "rewards/margins": 0.9982012510299683, "rewards/rejected": 1.5365780591964722, "step": 4656 }, { "epoch": 1.03, "learning_rate": 4.994623016805012e-06, "logits/chosen": -1.966303825378418, "logits/rejected": -1.9604172706604004, "logps/chosen": -34.898956298828125, "logps/rejected": -89.96824645996094, "loss": 0.2697, "rewards/accuracies": 1.0, "rewards/chosen": 3.2975754737854004, "rewards/margins": 1.1109216213226318, "rewards/rejected": 2.1866538524627686, "step": 4657 }, { "epoch": 1.03, "learning_rate": 4.9928306901481275e-06, "logits/chosen": -2.11411714553833, "logits/rejected": -2.1282339096069336, "logps/chosen": -64.64068603515625, "logps/rejected": -110.91214752197266, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 6.784080505371094, "rewards/margins": 3.380481719970703, "rewards/rejected": 3.4035987854003906, "step": 4658 }, { "epoch": 1.03, "learning_rate": 4.991038364412482e-06, "logits/chosen": -1.953898310661316, "logits/rejected": -1.912373661994934, "logps/chosen": -66.18382263183594, "logps/rejected": -68.32626342773438, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 4.626903057098389, "rewards/margins": 2.18251895904541, "rewards/rejected": 2.4443840980529785, "step": 4659 }, { "epoch": 1.03, "learning_rate": 4.989246039828386e-06, "logits/chosen": -1.6218645572662354, "logits/rejected": -1.7486517429351807, "logps/chosen": -74.54658508300781, "logps/rejected": -143.6463623046875, "loss": 2.2501, "rewards/accuracies": 0.0, "rewards/chosen": 6.588139533996582, "rewards/margins": -4.474825859069824, "rewards/rejected": 11.062965393066406, "step": 4660 }, { "epoch": 1.03, "learning_rate": 4.987453716626148e-06, "logits/chosen": -1.8637456893920898, "logits/rejected": -1.8628275394439697, "logps/chosen": -113.74400329589844, "logps/rejected": -61.767547607421875, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 10.670458793640137, "rewards/margins": 2.5167160034179688, "rewards/rejected": 8.153742790222168, "step": 4661 }, { "epoch": 1.03, "learning_rate": 4.985661395036076e-06, "logits/chosen": -1.6303616762161255, "logits/rejected": -1.6303616762161255, "logps/chosen": -3.1637766361236572, "logps/rejected": -3.1637766361236572, "loss": 0.3555, "rewards/accuracies": 0.0, "rewards/chosen": 1.3943073749542236, "rewards/margins": 0.0, "rewards/rejected": 1.3943073749542236, "step": 4662 }, { "epoch": 1.03, "learning_rate": 4.983869075288482e-06, "logits/chosen": -1.9555184841156006, "logits/rejected": -1.9234870672225952, "logps/chosen": -90.24942016601562, "logps/rejected": -85.93980407714844, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 10.35871410369873, "rewards/margins": 2.8997817039489746, "rewards/rejected": 7.458932399749756, "step": 4663 }, { "epoch": 1.03, "learning_rate": 4.982076757613671e-06, "logits/chosen": -1.850281000137329, "logits/rejected": -1.4429715871810913, "logps/chosen": -36.10865020751953, "logps/rejected": -79.40034484863281, "loss": 0.4932, "rewards/accuracies": 0.0, "rewards/chosen": 2.8000328540802, "rewards/margins": -0.4538002014160156, "rewards/rejected": 3.253833055496216, "step": 4664 }, { "epoch": 1.03, "learning_rate": 4.9802844422419555e-06, "logits/chosen": -1.8765658140182495, "logits/rejected": -1.7711561918258667, "logps/chosen": -66.1058349609375, "logps/rejected": -147.80885314941406, "loss": 0.2609, "rewards/accuracies": 1.0, "rewards/chosen": 7.5492095947265625, "rewards/margins": 0.47756338119506836, "rewards/rejected": 7.071646213531494, "step": 4665 }, { "epoch": 1.03, "learning_rate": 4.978492129403642e-06, "logits/chosen": -2.056075096130371, "logits/rejected": -2.077714443206787, "logps/chosen": -105.54016876220703, "logps/rejected": -95.12056732177734, "loss": 0.1025, "rewards/accuracies": 1.0, "rewards/chosen": 9.87375545501709, "rewards/margins": 1.640157699584961, "rewards/rejected": 8.233597755432129, "step": 4666 }, { "epoch": 1.03, "learning_rate": 4.976699819329037e-06, "logits/chosen": -1.8818424940109253, "logits/rejected": -1.770484209060669, "logps/chosen": -82.3134765625, "logps/rejected": -49.99309539794922, "loss": 0.19, "rewards/accuracies": 1.0, "rewards/chosen": 9.201573371887207, "rewards/margins": 2.5973076820373535, "rewards/rejected": 6.6042656898498535, "step": 4667 }, { "epoch": 1.03, "learning_rate": 4.974907512248451e-06, "logits/chosen": -2.1838276386260986, "logits/rejected": -2.164050340652466, "logps/chosen": -91.64533996582031, "logps/rejected": -46.469139099121094, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 5.152072429656982, "rewards/margins": 2.3982088565826416, "rewards/rejected": 2.753863573074341, "step": 4668 }, { "epoch": 1.03, "learning_rate": 4.973115208392186e-06, "logits/chosen": -1.7326288223266602, "logits/rejected": -1.6603705883026123, "logps/chosen": -23.726089477539062, "logps/rejected": -14.287925720214844, "loss": 0.4959, "rewards/accuracies": 1.0, "rewards/chosen": 1.8420284986495972, "rewards/margins": 1.055525779724121, "rewards/rejected": 0.7865026593208313, "step": 4669 }, { "epoch": 1.03, "learning_rate": 4.971322907990555e-06, "logits/chosen": -1.8672642707824707, "logits/rejected": -1.6696438789367676, "logps/chosen": -71.36251831054688, "logps/rejected": -34.969825744628906, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 4.669954776763916, "rewards/margins": 4.898231029510498, "rewards/rejected": -0.22827644646167755, "step": 4670 }, { "epoch": 1.03, "learning_rate": 4.969530611273859e-06, "logits/chosen": -2.019312858581543, "logits/rejected": -1.9772162437438965, "logps/chosen": -52.82798767089844, "logps/rejected": -48.79505920410156, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 6.114359378814697, "rewards/margins": 2.2326927185058594, "rewards/rejected": 3.881666660308838, "step": 4671 }, { "epoch": 1.03, "learning_rate": 4.967738318472408e-06, "logits/chosen": -1.8293668031692505, "logits/rejected": -1.8131771087646484, "logps/chosen": -122.56271362304688, "logps/rejected": -137.0241241455078, "loss": 0.1043, "rewards/accuracies": 1.0, "rewards/chosen": 10.754281997680664, "rewards/margins": 1.481607437133789, "rewards/rejected": 9.272674560546875, "step": 4672 }, { "epoch": 1.03, "learning_rate": 4.965946029816504e-06, "logits/chosen": -1.7861158847808838, "logits/rejected": -1.6769429445266724, "logps/chosen": -43.73383331298828, "logps/rejected": -49.21772384643555, "loss": 0.5218, "rewards/accuracies": 1.0, "rewards/chosen": 2.3354203701019287, "rewards/margins": 0.14093756675720215, "rewards/rejected": 2.1944828033447266, "step": 4673 }, { "epoch": 1.03, "learning_rate": 4.964153745536455e-06, "logits/chosen": -1.9167511463165283, "logits/rejected": -1.8287447690963745, "logps/chosen": -76.82490539550781, "logps/rejected": -100.59860229492188, "loss": 0.6446, "rewards/accuracies": 0.0, "rewards/chosen": 8.211714744567871, "rewards/margins": -0.931208610534668, "rewards/rejected": 9.142923355102539, "step": 4674 }, { "epoch": 1.03, "learning_rate": 4.962361465862564e-06, "logits/chosen": -2.1000185012817383, "logits/rejected": -2.0832698345184326, "logps/chosen": -39.34258270263672, "logps/rejected": -63.23694610595703, "loss": 0.868, "rewards/accuracies": 0.0, "rewards/chosen": 4.72567081451416, "rewards/margins": -1.4697751998901367, "rewards/rejected": 6.195446014404297, "step": 4675 }, { "epoch": 1.03, "learning_rate": 4.960569191025133e-06, "logits/chosen": -1.7049018144607544, "logits/rejected": -1.6801249980926514, "logps/chosen": -33.618125915527344, "logps/rejected": -41.449501037597656, "loss": 2.6039, "rewards/accuracies": 1.0, "rewards/chosen": 3.031010389328003, "rewards/margins": 0.1736128330230713, "rewards/rejected": 2.8573975563049316, "step": 4676 }, { "epoch": 1.04, "learning_rate": 4.958776921254467e-06, "logits/chosen": -2.024864673614502, "logits/rejected": -1.953014850616455, "logps/chosen": -140.50057983398438, "logps/rejected": -35.60498046875, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": 5.213589668273926, "rewards/margins": 3.117669105529785, "rewards/rejected": 2.0959205627441406, "step": 4677 }, { "epoch": 1.04, "learning_rate": 4.956984656780866e-06, "logits/chosen": -2.0575265884399414, "logits/rejected": -2.0569517612457275, "logps/chosen": -73.53472900390625, "logps/rejected": -61.09901428222656, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 6.28097677230835, "rewards/margins": 1.4347763061523438, "rewards/rejected": 4.846200466156006, "step": 4678 }, { "epoch": 1.04, "learning_rate": 4.955192397834635e-06, "logits/chosen": -1.949314832687378, "logits/rejected": -1.9490940570831299, "logps/chosen": -47.201358795166016, "logps/rejected": -47.71358108520508, "loss": 0.3234, "rewards/accuracies": 1.0, "rewards/chosen": 3.9679791927337646, "rewards/margins": 0.13782596588134766, "rewards/rejected": 3.830153226852417, "step": 4679 }, { "epoch": 1.04, "learning_rate": 4.95340014464607e-06, "logits/chosen": -1.9399299621582031, "logits/rejected": -2.024521589279175, "logps/chosen": -53.980831146240234, "logps/rejected": -61.36155700683594, "loss": 2.1487, "rewards/accuracies": 0.0, "rewards/chosen": 3.4582126140594482, "rewards/margins": -3.7444517612457275, "rewards/rejected": 7.202664375305176, "step": 4680 }, { "epoch": 1.04, "learning_rate": 4.951607897445477e-06, "logits/chosen": -1.7479325532913208, "logits/rejected": -1.6927493810653687, "logps/chosen": -186.8328857421875, "logps/rejected": -44.81814193725586, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 7.446994304656982, "rewards/margins": 2.88145112991333, "rewards/rejected": 4.565543174743652, "step": 4681 }, { "epoch": 1.04, "learning_rate": 4.949815656463151e-06, "logits/chosen": -1.82835054397583, "logits/rejected": -1.7140460014343262, "logps/chosen": -51.341922760009766, "logps/rejected": -31.48127555847168, "loss": 0.6039, "rewards/accuracies": 1.0, "rewards/chosen": 5.956946849822998, "rewards/margins": 2.414435863494873, "rewards/rejected": 3.542510986328125, "step": 4682 }, { "epoch": 1.04, "learning_rate": 4.948023421929395e-06, "logits/chosen": -2.0227437019348145, "logits/rejected": -2.015216112136841, "logps/chosen": -72.76593780517578, "logps/rejected": -87.51836395263672, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": 8.15421199798584, "rewards/margins": 2.2162137031555176, "rewards/rejected": 5.937998294830322, "step": 4683 }, { "epoch": 1.04, "learning_rate": 4.946231194074503e-06, "logits/chosen": -2.0186257362365723, "logits/rejected": -1.9821487665176392, "logps/chosen": -114.65422058105469, "logps/rejected": -171.78573608398438, "loss": 0.509, "rewards/accuracies": 1.0, "rewards/chosen": 7.025891304016113, "rewards/margins": 0.3057570457458496, "rewards/rejected": 6.720134258270264, "step": 4684 }, { "epoch": 1.04, "learning_rate": 4.944438973128775e-06, "logits/chosen": -1.9189008474349976, "logits/rejected": -1.7405641078948975, "logps/chosen": -92.3780288696289, "logps/rejected": -42.966346740722656, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 7.226474761962891, "rewards/margins": 2.634772300720215, "rewards/rejected": 4.591702461242676, "step": 4685 }, { "epoch": 1.04, "learning_rate": 4.942646759322504e-06, "logits/chosen": -2.001858949661255, "logits/rejected": -2.0024919509887695, "logps/chosen": -51.66548538208008, "logps/rejected": -82.73310852050781, "loss": 1.0965, "rewards/accuracies": 0.0, "rewards/chosen": 4.818571090698242, "rewards/margins": -2.0481204986572266, "rewards/rejected": 6.866691589355469, "step": 4686 }, { "epoch": 1.04, "learning_rate": 4.940854552885985e-06, "logits/chosen": -2.080456018447876, "logits/rejected": -2.080456018447876, "logps/chosen": -37.19357681274414, "logps/rejected": -37.19357681274414, "loss": 0.3541, "rewards/accuracies": 0.0, "rewards/chosen": 4.248605251312256, "rewards/margins": 0.0, "rewards/rejected": 4.248605251312256, "step": 4687 }, { "epoch": 1.04, "learning_rate": 4.939062354049516e-06, "logits/chosen": -1.9865212440490723, "logits/rejected": -1.9682725667953491, "logps/chosen": -26.3705997467041, "logps/rejected": -32.219295501708984, "loss": 0.4727, "rewards/accuracies": 1.0, "rewards/chosen": 2.98954176902771, "rewards/margins": 1.6629109382629395, "rewards/rejected": 1.3266308307647705, "step": 4688 }, { "epoch": 1.04, "learning_rate": 4.937270163043386e-06, "logits/chosen": -1.7507541179656982, "logits/rejected": -1.774610996246338, "logps/chosen": -29.18746566772461, "logps/rejected": -55.35809326171875, "loss": 0.4842, "rewards/accuracies": 0.0, "rewards/chosen": 4.571037769317627, "rewards/margins": -0.4559459686279297, "rewards/rejected": 5.026983737945557, "step": 4689 }, { "epoch": 1.04, "learning_rate": 4.935477980097891e-06, "logits/chosen": -1.8616752624511719, "logits/rejected": -1.8616752624511719, "logps/chosen": -56.60207748413086, "logps/rejected": -56.60207748413086, "loss": 0.3667, "rewards/accuracies": 0.0, "rewards/chosen": 6.256967544555664, "rewards/margins": 0.0, "rewards/rejected": 6.256967544555664, "step": 4690 }, { "epoch": 1.04, "learning_rate": 4.93368580544332e-06, "logits/chosen": -1.8221981525421143, "logits/rejected": -1.83463454246521, "logps/chosen": -47.00725555419922, "logps/rejected": -62.773345947265625, "loss": 0.2203, "rewards/accuracies": 1.0, "rewards/chosen": 4.769052982330322, "rewards/margins": 0.6388540267944336, "rewards/rejected": 4.130198955535889, "step": 4691 }, { "epoch": 1.04, "learning_rate": 4.931893639309963e-06, "logits/chosen": -1.8180357217788696, "logits/rejected": -1.7504862546920776, "logps/chosen": -39.10301971435547, "logps/rejected": -64.97140502929688, "loss": 0.1281, "rewards/accuracies": 1.0, "rewards/chosen": 3.0376222133636475, "rewards/margins": 1.295369029045105, "rewards/rejected": 1.7422531843185425, "step": 4692 }, { "epoch": 1.04, "learning_rate": 4.93010148192811e-06, "logits/chosen": -1.6530681848526, "logits/rejected": -1.579697847366333, "logps/chosen": -150.37689208984375, "logps/rejected": -46.01978302001953, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": 7.939566135406494, "rewards/margins": 4.336915016174316, "rewards/rejected": 3.602651357650757, "step": 4693 }, { "epoch": 1.04, "learning_rate": 4.928309333528049e-06, "logits/chosen": -2.1308767795562744, "logits/rejected": -2.0421297550201416, "logps/chosen": -70.576171875, "logps/rejected": -61.09782028198242, "loss": 1.0665, "rewards/accuracies": 0.0, "rewards/chosen": 5.326338291168213, "rewards/margins": -1.839961051940918, "rewards/rejected": 7.166299343109131, "step": 4694 }, { "epoch": 1.04, "learning_rate": 4.926517194340067e-06, "logits/chosen": -1.9731462001800537, "logits/rejected": -1.9450974464416504, "logps/chosen": -91.68524169921875, "logps/rejected": -100.70146179199219, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 5.710505962371826, "rewards/margins": 3.482239007949829, "rewards/rejected": 2.228266954421997, "step": 4695 }, { "epoch": 1.04, "learning_rate": 4.924725064594448e-06, "logits/chosen": -1.8963043689727783, "logits/rejected": -1.8042075634002686, "logps/chosen": -166.08868408203125, "logps/rejected": -50.29716873168945, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 8.96030330657959, "rewards/margins": 3.794811248779297, "rewards/rejected": 5.165492057800293, "step": 4696 }, { "epoch": 1.04, "learning_rate": 4.922932944521477e-06, "logits/chosen": -1.6180486679077148, "logits/rejected": -1.5872714519500732, "logps/chosen": -28.50298309326172, "logps/rejected": -21.606502532958984, "loss": 0.6539, "rewards/accuracies": 0.0, "rewards/chosen": 2.986182451248169, "rewards/margins": -0.9525761604309082, "rewards/rejected": 3.938758611679077, "step": 4697 }, { "epoch": 1.04, "learning_rate": 4.921140834351436e-06, "logits/chosen": -1.7313857078552246, "logits/rejected": -1.712307333946228, "logps/chosen": -85.04261779785156, "logps/rejected": -63.131103515625, "loss": 1.2726, "rewards/accuracies": 1.0, "rewards/chosen": 4.256776332855225, "rewards/margins": 2.155207633972168, "rewards/rejected": 2.1015686988830566, "step": 4698 }, { "epoch": 1.04, "learning_rate": 4.9193487343146105e-06, "logits/chosen": -1.7136621475219727, "logits/rejected": -1.817397952079773, "logps/chosen": -47.789859771728516, "logps/rejected": -121.1159896850586, "loss": 1.5858, "rewards/accuracies": 0.0, "rewards/chosen": 5.313747882843018, "rewards/margins": -3.110778331756592, "rewards/rejected": 8.42452621459961, "step": 4699 }, { "epoch": 1.04, "learning_rate": 4.9175566446412785e-06, "logits/chosen": -2.0486488342285156, "logits/rejected": -1.9434611797332764, "logps/chosen": -30.031068801879883, "logps/rejected": -30.71666717529297, "loss": 0.6797, "rewards/accuracies": 0.0, "rewards/chosen": 2.895751714706421, "rewards/margins": -0.6298952102661133, "rewards/rejected": 3.525646924972534, "step": 4700 }, { "epoch": 1.04, "learning_rate": 4.915764565561719e-06, "logits/chosen": -1.887003779411316, "logits/rejected": -1.8588390350341797, "logps/chosen": -64.25885009765625, "logps/rejected": -65.90155792236328, "loss": 0.9129, "rewards/accuracies": 1.0, "rewards/chosen": 5.664947509765625, "rewards/margins": 1.8414435386657715, "rewards/rejected": 3.8235039710998535, "step": 4701 }, { "epoch": 1.04, "learning_rate": 4.9139724973062096e-06, "logits/chosen": -1.7065190076828003, "logits/rejected": -1.6669317483901978, "logps/chosen": -98.21414947509766, "logps/rejected": -124.30204772949219, "loss": 0.0877, "rewards/accuracies": 1.0, "rewards/chosen": 10.903217315673828, "rewards/margins": 2.8603782653808594, "rewards/rejected": 8.042839050292969, "step": 4702 }, { "epoch": 1.04, "learning_rate": 4.912180440105028e-06, "logits/chosen": -1.6958376169204712, "logits/rejected": -1.6701587438583374, "logps/chosen": -91.5574722290039, "logps/rejected": -73.13390350341797, "loss": 0.1365, "rewards/accuracies": 1.0, "rewards/chosen": 8.813586235046387, "rewards/margins": 1.5498003959655762, "rewards/rejected": 7.2637858390808105, "step": 4703 }, { "epoch": 1.04, "learning_rate": 4.9103883941884475e-06, "logits/chosen": -1.7080174684524536, "logits/rejected": -1.6368545293807983, "logps/chosen": -49.370155334472656, "logps/rejected": -34.693389892578125, "loss": 0.2662, "rewards/accuracies": 1.0, "rewards/chosen": 5.159615516662598, "rewards/margins": 0.37403202056884766, "rewards/rejected": 4.78558349609375, "step": 4704 }, { "epoch": 1.04, "learning_rate": 4.908596359786745e-06, "logits/chosen": -1.9584882259368896, "logits/rejected": -1.967518925666809, "logps/chosen": -88.23196411132812, "logps/rejected": -80.18186950683594, "loss": 0.1965, "rewards/accuracies": 1.0, "rewards/chosen": 7.2562103271484375, "rewards/margins": 0.8971419334411621, "rewards/rejected": 6.359068393707275, "step": 4705 }, { "epoch": 1.04, "learning_rate": 4.906804337130191e-06, "logits/chosen": -1.9797595739364624, "logits/rejected": -1.9157993793487549, "logps/chosen": -93.67361450195312, "logps/rejected": -77.7356948852539, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 10.101419448852539, "rewards/margins": 2.381394386291504, "rewards/rejected": 7.720025062561035, "step": 4706 }, { "epoch": 1.04, "learning_rate": 4.905012326449053e-06, "logits/chosen": -1.7012511491775513, "logits/rejected": -1.6978421211242676, "logps/chosen": -73.65130615234375, "logps/rejected": -72.4156494140625, "loss": 0.2716, "rewards/accuracies": 1.0, "rewards/chosen": 3.341731309890747, "rewards/margins": 0.5331521034240723, "rewards/rejected": 2.808579206466675, "step": 4707 }, { "epoch": 1.04, "learning_rate": 4.903220327973605e-06, "logits/chosen": -1.4139753580093384, "logits/rejected": -1.400488257408142, "logps/chosen": -75.78406524658203, "logps/rejected": -73.28096008300781, "loss": 0.8544, "rewards/accuracies": 0.0, "rewards/chosen": 1.077539086341858, "rewards/margins": -1.4789115190505981, "rewards/rejected": 2.556450605392456, "step": 4708 }, { "epoch": 1.04, "learning_rate": 4.901428341934112e-06, "logits/chosen": -1.774571418762207, "logits/rejected": -1.8118778467178345, "logps/chosen": -38.08140563964844, "logps/rejected": -89.82257080078125, "loss": 1.6837, "rewards/accuracies": 0.0, "rewards/chosen": 3.862844944000244, "rewards/margins": -2.8088531494140625, "rewards/rejected": 6.671698093414307, "step": 4709 }, { "epoch": 1.04, "learning_rate": 4.89963636856084e-06, "logits/chosen": -1.7825608253479004, "logits/rejected": -1.825745701789856, "logps/chosen": -40.23689270019531, "logps/rejected": -103.496337890625, "loss": 1.2813, "rewards/accuracies": 0.0, "rewards/chosen": 4.2940263748168945, "rewards/margins": -2.4225997924804688, "rewards/rejected": 6.716626167297363, "step": 4710 }, { "epoch": 1.04, "learning_rate": 4.897844408084051e-06, "logits/chosen": -1.9581679105758667, "logits/rejected": -1.946406602859497, "logps/chosen": -78.87741088867188, "logps/rejected": -65.1824722290039, "loss": 0.5232, "rewards/accuracies": 0.0, "rewards/chosen": 4.5988287925720215, "rewards/margins": -0.5307650566101074, "rewards/rejected": 5.129593849182129, "step": 4711 }, { "epoch": 1.04, "learning_rate": 4.896052460734013e-06, "logits/chosen": -2.0012564659118652, "logits/rejected": -2.017477035522461, "logps/chosen": -75.29442596435547, "logps/rejected": -83.98857116699219, "loss": 0.2094, "rewards/accuracies": 1.0, "rewards/chosen": 5.874730587005615, "rewards/margins": 0.8555121421813965, "rewards/rejected": 5.019218444824219, "step": 4712 }, { "epoch": 1.04, "learning_rate": 4.894260526740982e-06, "logits/chosen": -2.0312132835388184, "logits/rejected": -2.0557520389556885, "logps/chosen": -45.21769332885742, "logps/rejected": -72.18390655517578, "loss": 1.7875, "rewards/accuracies": 1.0, "rewards/chosen": 3.450754165649414, "rewards/margins": 0.253159761428833, "rewards/rejected": 3.197594404220581, "step": 4713 }, { "epoch": 1.04, "learning_rate": 4.89246860633522e-06, "logits/chosen": -1.6991387605667114, "logits/rejected": -1.6991387605667114, "logps/chosen": -31.802173614501953, "logps/rejected": -31.802173614501953, "loss": 0.3649, "rewards/accuracies": 0.0, "rewards/chosen": 7.6613922119140625, "rewards/margins": 0.0, "rewards/rejected": 7.6613922119140625, "step": 4714 }, { "epoch": 1.04, "learning_rate": 4.890676699746982e-06, "logits/chosen": -1.873939037322998, "logits/rejected": -1.8009693622589111, "logps/chosen": -26.6982479095459, "logps/rejected": -6.466404438018799, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 2.8345625400543213, "rewards/margins": 2.2091195583343506, "rewards/rejected": 0.6254429221153259, "step": 4715 }, { "epoch": 1.04, "learning_rate": 4.888884807206528e-06, "logits/chosen": -1.666218876838684, "logits/rejected": -1.6139076948165894, "logps/chosen": -46.178646087646484, "logps/rejected": -44.455810546875, "loss": 0.5561, "rewards/accuracies": 0.0, "rewards/chosen": 2.6233417987823486, "rewards/margins": -0.6182048320770264, "rewards/rejected": 3.241546630859375, "step": 4716 }, { "epoch": 1.04, "learning_rate": 4.887092928944109e-06, "logits/chosen": -1.9790748357772827, "logits/rejected": -1.8655357360839844, "logps/chosen": -97.12470245361328, "logps/rejected": -54.677894592285156, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": 7.622835636138916, "rewards/margins": 2.846696376800537, "rewards/rejected": 4.776139259338379, "step": 4717 }, { "epoch": 1.04, "learning_rate": 4.8853010651899755e-06, "logits/chosen": -2.0334770679473877, "logits/rejected": -1.8673020601272583, "logps/chosen": -108.23524475097656, "logps/rejected": -27.529895782470703, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 5.978895664215088, "rewards/margins": 5.437168121337891, "rewards/rejected": 0.5417274832725525, "step": 4718 }, { "epoch": 1.04, "learning_rate": 4.883509216174381e-06, "logits/chosen": -1.9315563440322876, "logits/rejected": -1.9166685342788696, "logps/chosen": -69.58343505859375, "logps/rejected": -72.74336242675781, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": 5.512117862701416, "rewards/margins": 2.5019493103027344, "rewards/rejected": 3.0101685523986816, "step": 4719 }, { "epoch": 1.04, "learning_rate": 4.88171738212757e-06, "logits/chosen": -1.8986644744873047, "logits/rejected": -1.8680709600448608, "logps/chosen": -65.08976745605469, "logps/rejected": -51.12139129638672, "loss": 0.9478, "rewards/accuracies": 0.0, "rewards/chosen": 1.1123474836349487, "rewards/margins": -1.7105377912521362, "rewards/rejected": 2.822885274887085, "step": 4720 }, { "epoch": 1.04, "learning_rate": 4.879925563279793e-06, "logits/chosen": -1.8310261964797974, "logits/rejected": -1.6902507543563843, "logps/chosen": -76.65755462646484, "logps/rejected": -18.50385856628418, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 3.9302239418029785, "rewards/margins": 3.185206651687622, "rewards/rejected": 0.7450172305107117, "step": 4721 }, { "epoch": 1.05, "learning_rate": 4.87813375986129e-06, "logits/chosen": -1.9003630876541138, "logits/rejected": -1.9336315393447876, "logps/chosen": -51.60590744018555, "logps/rejected": -90.6361312866211, "loss": 0.4619, "rewards/accuracies": 0.0, "rewards/chosen": 6.03915548324585, "rewards/margins": -0.38803672790527344, "rewards/rejected": 6.427192211151123, "step": 4722 }, { "epoch": 1.05, "learning_rate": 4.8763419721023085e-06, "logits/chosen": -1.7914880514144897, "logits/rejected": -1.7717070579528809, "logps/chosen": -39.977622985839844, "logps/rejected": -101.6961669921875, "loss": 3.2771, "rewards/accuracies": 0.0, "rewards/chosen": 3.8342156410217285, "rewards/margins": -6.5023932456970215, "rewards/rejected": 10.33660888671875, "step": 4723 }, { "epoch": 1.05, "learning_rate": 4.874550200233085e-06, "logits/chosen": -1.902753233909607, "logits/rejected": -1.9678819179534912, "logps/chosen": -29.153409957885742, "logps/rejected": -85.52212524414062, "loss": 1.1228, "rewards/accuracies": 0.0, "rewards/chosen": 6.1307196617126465, "rewards/margins": -1.5273690223693848, "rewards/rejected": 7.658088684082031, "step": 4724 }, { "epoch": 1.05, "learning_rate": 4.87275844448386e-06, "logits/chosen": -1.7199708223342896, "logits/rejected": -1.7371762990951538, "logps/chosen": -37.13072967529297, "logps/rejected": -62.04081726074219, "loss": 3.496, "rewards/accuracies": 1.0, "rewards/chosen": 4.9800639152526855, "rewards/margins": 0.07616519927978516, "rewards/rejected": 4.9038987159729, "step": 4725 }, { "epoch": 1.05, "learning_rate": 4.870966705084869e-06, "logits/chosen": -1.8274685144424438, "logits/rejected": -1.7892098426818848, "logps/chosen": -99.78421783447266, "logps/rejected": -55.63334274291992, "loss": 0.1043, "rewards/accuracies": 1.0, "rewards/chosen": 4.23140811920166, "rewards/margins": 2.046215534210205, "rewards/rejected": 2.185192584991455, "step": 4726 }, { "epoch": 1.05, "learning_rate": 4.869174982266346e-06, "logits/chosen": -1.906217098236084, "logits/rejected": -1.8892515897750854, "logps/chosen": -36.816741943359375, "logps/rejected": -67.949951171875, "loss": 1.0856, "rewards/accuracies": 0.0, "rewards/chosen": 5.3029093742370605, "rewards/margins": -1.6183538436889648, "rewards/rejected": 6.921263217926025, "step": 4727 }, { "epoch": 1.05, "learning_rate": 4.867383276258524e-06, "logits/chosen": -1.9340293407440186, "logits/rejected": -1.9094998836517334, "logps/chosen": -88.62687683105469, "logps/rejected": -106.39315032958984, "loss": 0.2058, "rewards/accuracies": 1.0, "rewards/chosen": 9.605422019958496, "rewards/margins": 0.8241481781005859, "rewards/rejected": 8.78127384185791, "step": 4728 }, { "epoch": 1.05, "learning_rate": 4.86559158729163e-06, "logits/chosen": -2.041900157928467, "logits/rejected": -1.9765993356704712, "logps/chosen": -58.03853988647461, "logps/rejected": -52.60747146606445, "loss": 0.1934, "rewards/accuracies": 1.0, "rewards/chosen": 3.272205114364624, "rewards/margins": 0.8262109756469727, "rewards/rejected": 2.4459941387176514, "step": 4729 }, { "epoch": 1.05, "learning_rate": 4.863799915595896e-06, "logits/chosen": -1.6978095769882202, "logits/rejected": -1.7734887599945068, "logps/chosen": -76.47087097167969, "logps/rejected": -162.19903564453125, "loss": 0.6546, "rewards/accuracies": 0.0, "rewards/chosen": 8.883780479431152, "rewards/margins": -0.7259855270385742, "rewards/rejected": 9.609766006469727, "step": 4730 }, { "epoch": 1.05, "learning_rate": 4.862008261401543e-06, "logits/chosen": -1.7617131471633911, "logits/rejected": -1.7773973941802979, "logps/chosen": -58.677860260009766, "logps/rejected": -54.85316848754883, "loss": 1.0545, "rewards/accuracies": 0.0, "rewards/chosen": 3.5738461017608643, "rewards/margins": -1.5611155033111572, "rewards/rejected": 5.1349616050720215, "step": 4731 }, { "epoch": 1.05, "learning_rate": 4.8602166249387995e-06, "logits/chosen": -1.9938371181488037, "logits/rejected": -1.9413232803344727, "logps/chosen": -97.52838134765625, "logps/rejected": -47.89283752441406, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 7.550637722015381, "rewards/margins": 4.879215240478516, "rewards/rejected": 2.6714227199554443, "step": 4732 }, { "epoch": 1.05, "learning_rate": 4.858425006437882e-06, "logits/chosen": -2.0469489097595215, "logits/rejected": -2.097639799118042, "logps/chosen": -52.14929962158203, "logps/rejected": -107.09500122070312, "loss": 0.4884, "rewards/accuracies": 0.0, "rewards/chosen": 5.730713844299316, "rewards/margins": -0.49505090713500977, "rewards/rejected": 6.225764751434326, "step": 4733 }, { "epoch": 1.05, "learning_rate": 4.856633406129011e-06, "logits/chosen": -1.9857022762298584, "logits/rejected": -1.955364465713501, "logps/chosen": -51.51258850097656, "logps/rejected": -42.2841796875, "loss": 0.9152, "rewards/accuracies": 1.0, "rewards/chosen": 3.7276535034179688, "rewards/margins": 0.9337952136993408, "rewards/rejected": 2.793858289718628, "step": 4734 }, { "epoch": 1.05, "learning_rate": 4.854841824242404e-06, "logits/chosen": -1.6593341827392578, "logits/rejected": -1.6552757024765015, "logps/chosen": -31.17511749267578, "logps/rejected": -58.03031539916992, "loss": 0.4566, "rewards/accuracies": 0.0, "rewards/chosen": 2.8458945751190186, "rewards/margins": -0.29879045486450195, "rewards/rejected": 3.1446850299835205, "step": 4735 }, { "epoch": 1.05, "learning_rate": 4.853050261008273e-06, "logits/chosen": -2.105700969696045, "logits/rejected": -2.0927116870880127, "logps/chosen": -60.382659912109375, "logps/rejected": -97.5975341796875, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 8.804941177368164, "rewards/margins": 0.6003751754760742, "rewards/rejected": 8.20456600189209, "step": 4736 }, { "epoch": 1.05, "learning_rate": 4.85125871665683e-06, "logits/chosen": -1.710174560546875, "logits/rejected": -1.7715901136398315, "logps/chosen": -39.593318939208984, "logps/rejected": -50.92921447753906, "loss": 0.8617, "rewards/accuracies": 0.0, "rewards/chosen": 3.4435017108917236, "rewards/margins": -1.1713817119598389, "rewards/rejected": 4.6148834228515625, "step": 4737 }, { "epoch": 1.05, "learning_rate": 4.8494671914182835e-06, "logits/chosen": -2.0544466972351074, "logits/rejected": -1.953137755393982, "logps/chosen": -108.42201232910156, "logps/rejected": -17.907155990600586, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 6.675880432128906, "rewards/margins": 6.329734802246094, "rewards/rejected": 0.3461456298828125, "step": 4738 }, { "epoch": 1.05, "learning_rate": 4.847675685522842e-06, "logits/chosen": -1.7617515325546265, "logits/rejected": -1.7472646236419678, "logps/chosen": -3.1522178649902344, "logps/rejected": -18.14903450012207, "loss": 0.2909, "rewards/accuracies": 1.0, "rewards/chosen": 0.7871498465538025, "rewards/margins": 0.2432422637939453, "rewards/rejected": 0.5439075827598572, "step": 4739 }, { "epoch": 1.05, "learning_rate": 4.8458841992007074e-06, "logits/chosen": -1.6984013319015503, "logits/rejected": -1.706176996231079, "logps/chosen": -46.48384094238281, "logps/rejected": -25.339244842529297, "loss": 0.5567, "rewards/accuracies": 1.0, "rewards/chosen": 3.543048143386841, "rewards/margins": 0.6495075225830078, "rewards/rejected": 2.893540620803833, "step": 4740 }, { "epoch": 1.05, "learning_rate": 4.8440927326820845e-06, "logits/chosen": -1.6676472425460815, "logits/rejected": -1.6234270334243774, "logps/chosen": -76.92514038085938, "logps/rejected": -102.27899169921875, "loss": 0.1316, "rewards/accuracies": 1.0, "rewards/chosen": 5.300347805023193, "rewards/margins": 2.4158904552459717, "rewards/rejected": 2.8844573497772217, "step": 4741 }, { "epoch": 1.05, "learning_rate": 4.842301286197171e-06, "logits/chosen": -1.9390286207199097, "logits/rejected": -1.9927083253860474, "logps/chosen": -56.464473724365234, "logps/rejected": -118.38737487792969, "loss": 2.3901, "rewards/accuracies": 0.0, "rewards/chosen": 3.268707752227783, "rewards/margins": -4.769364833831787, "rewards/rejected": 8.03807258605957, "step": 4742 }, { "epoch": 1.05, "learning_rate": 4.840509859976162e-06, "logits/chosen": -2.0269038677215576, "logits/rejected": -1.7895050048828125, "logps/chosen": -174.51223754882812, "logps/rejected": -49.92441940307617, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 7.303946018218994, "rewards/margins": 3.150834083557129, "rewards/rejected": 4.153111934661865, "step": 4743 }, { "epoch": 1.05, "learning_rate": 4.8387184542492515e-06, "logits/chosen": -1.9429957866668701, "logits/rejected": -1.923067569732666, "logps/chosen": -34.046630859375, "logps/rejected": -45.90400695800781, "loss": 0.1532, "rewards/accuracies": 1.0, "rewards/chosen": 3.4613304138183594, "rewards/margins": 1.0258934497833252, "rewards/rejected": 2.435436964035034, "step": 4744 }, { "epoch": 1.05, "learning_rate": 4.836927069246633e-06, "logits/chosen": -2.0519418716430664, "logits/rejected": -1.9565056562423706, "logps/chosen": -128.30638122558594, "logps/rejected": -81.4230728149414, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 7.129155158996582, "rewards/margins": 2.497281551361084, "rewards/rejected": 4.631873607635498, "step": 4745 }, { "epoch": 1.05, "learning_rate": 4.835135705198496e-06, "logits/chosen": -1.9977964162826538, "logits/rejected": -2.0124614238739014, "logps/chosen": -31.45690155029297, "logps/rejected": -42.46460723876953, "loss": 1.6024, "rewards/accuracies": 1.0, "rewards/chosen": 4.0558295249938965, "rewards/margins": 0.33353185653686523, "rewards/rejected": 3.7222976684570312, "step": 4746 }, { "epoch": 1.05, "learning_rate": 4.833344362335022e-06, "logits/chosen": -1.5624687671661377, "logits/rejected": -1.5276519060134888, "logps/chosen": -40.5962028503418, "logps/rejected": -61.509765625, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 4.176397323608398, "rewards/margins": 1.8267757892608643, "rewards/rejected": 2.349621534347534, "step": 4747 }, { "epoch": 1.05, "learning_rate": 4.8315530408863975e-06, "logits/chosen": -1.6334375143051147, "logits/rejected": -1.5766518115997314, "logps/chosen": -49.145992279052734, "logps/rejected": -62.1168327331543, "loss": 0.3006, "rewards/accuracies": 1.0, "rewards/chosen": 2.156625747680664, "rewards/margins": 0.390788197517395, "rewards/rejected": 1.765837550163269, "step": 4748 }, { "epoch": 1.05, "learning_rate": 4.829761741082802e-06, "logits/chosen": -1.9156657457351685, "logits/rejected": -1.8668464422225952, "logps/chosen": -108.8296127319336, "logps/rejected": -39.78553771972656, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 5.931020259857178, "rewards/margins": 3.1243391036987305, "rewards/rejected": 2.8066811561584473, "step": 4749 }, { "epoch": 1.05, "learning_rate": 4.8279704631544136e-06, "logits/chosen": -1.994295597076416, "logits/rejected": -1.8808856010437012, "logps/chosen": -48.93707275390625, "logps/rejected": -15.646937370300293, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 5.05355978012085, "rewards/margins": 4.365302085876465, "rewards/rejected": 0.6882576942443848, "step": 4750 }, { "epoch": 1.05, "learning_rate": 4.826179207331408e-06, "logits/chosen": -1.5394001007080078, "logits/rejected": -1.5869121551513672, "logps/chosen": -50.91670227050781, "logps/rejected": -53.279258728027344, "loss": 0.955, "rewards/accuracies": 1.0, "rewards/chosen": 4.267640113830566, "rewards/margins": 0.12192296981811523, "rewards/rejected": 4.145717144012451, "step": 4751 }, { "epoch": 1.05, "learning_rate": 4.824387973843957e-06, "logits/chosen": -1.8017137050628662, "logits/rejected": -1.797567367553711, "logps/chosen": -67.98607635498047, "logps/rejected": -43.21749496459961, "loss": 0.251, "rewards/accuracies": 1.0, "rewards/chosen": 3.4552483558654785, "rewards/margins": 0.46294212341308594, "rewards/rejected": 2.9923062324523926, "step": 4752 }, { "epoch": 1.05, "learning_rate": 4.822596762922226e-06, "logits/chosen": -1.8933595418930054, "logits/rejected": -1.8920924663543701, "logps/chosen": -76.23757934570312, "logps/rejected": -76.75787353515625, "loss": 0.3391, "rewards/accuracies": 1.0, "rewards/chosen": 4.484765529632568, "rewards/margins": 0.90016770362854, "rewards/rejected": 3.5845978260040283, "step": 4753 }, { "epoch": 1.05, "learning_rate": 4.820805574796387e-06, "logits/chosen": -1.7579971551895142, "logits/rejected": -1.7396800518035889, "logps/chosen": -88.40113830566406, "logps/rejected": -72.23329162597656, "loss": 0.2299, "rewards/accuracies": 1.0, "rewards/chosen": 7.821311950683594, "rewards/margins": 0.5569748878479004, "rewards/rejected": 7.264337062835693, "step": 4754 }, { "epoch": 1.05, "learning_rate": 4.819014409696599e-06, "logits/chosen": -1.658538818359375, "logits/rejected": -1.5847444534301758, "logps/chosen": -28.391639709472656, "logps/rejected": -4.8093647956848145, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 3.1903741359710693, "rewards/margins": 2.194993019104004, "rewards/rejected": 0.9953811764717102, "step": 4755 }, { "epoch": 1.05, "learning_rate": 4.817223267853027e-06, "logits/chosen": -2.0564212799072266, "logits/rejected": -2.025453567504883, "logps/chosen": -85.22903442382812, "logps/rejected": -72.07675170898438, "loss": 0.2284, "rewards/accuracies": 1.0, "rewards/chosen": 5.7962965965271, "rewards/margins": 2.5215086936950684, "rewards/rejected": 3.2747879028320312, "step": 4756 }, { "epoch": 1.05, "learning_rate": 4.815432149495825e-06, "logits/chosen": -2.0078821182250977, "logits/rejected": -1.897152304649353, "logps/chosen": -146.88516235351562, "logps/rejected": -89.74267578125, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 7.923562526702881, "rewards/margins": 4.259897232055664, "rewards/rejected": 3.6636650562286377, "step": 4757 }, { "epoch": 1.05, "learning_rate": 4.813641054855146e-06, "logits/chosen": -1.995963215827942, "logits/rejected": -1.9498165845870972, "logps/chosen": -132.41436767578125, "logps/rejected": -73.32913208007812, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 6.810086250305176, "rewards/margins": 2.1470537185668945, "rewards/rejected": 4.663032531738281, "step": 4758 }, { "epoch": 1.05, "learning_rate": 4.811849984161146e-06, "logits/chosen": -1.7969295978546143, "logits/rejected": -1.890113115310669, "logps/chosen": -57.164398193359375, "logps/rejected": -91.26409149169922, "loss": 1.2747, "rewards/accuracies": 0.0, "rewards/chosen": 4.86013126373291, "rewards/margins": -2.3992581367492676, "rewards/rejected": 7.259389400482178, "step": 4759 }, { "epoch": 1.05, "learning_rate": 4.810058937643969e-06, "logits/chosen": -1.5649019479751587, "logits/rejected": -1.4774396419525146, "logps/chosen": -38.03117370605469, "logps/rejected": -38.842098236083984, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": 3.684659719467163, "rewards/margins": 1.6810340881347656, "rewards/rejected": 2.0036256313323975, "step": 4760 }, { "epoch": 1.05, "learning_rate": 4.8082679155337645e-06, "logits/chosen": -1.895074725151062, "logits/rejected": -1.895074725151062, "logps/chosen": -24.63492202758789, "logps/rejected": -24.63492202758789, "loss": 0.3786, "rewards/accuracies": 0.0, "rewards/chosen": 5.2663187980651855, "rewards/margins": 0.0, "rewards/rejected": 5.2663187980651855, "step": 4761 }, { "epoch": 1.05, "learning_rate": 4.806476918060668e-06, "logits/chosen": -1.4267356395721436, "logits/rejected": -1.4367501735687256, "logps/chosen": -31.146337509155273, "logps/rejected": -31.84376335144043, "loss": 0.5345, "rewards/accuracies": 0.0, "rewards/chosen": 3.422093152999878, "rewards/margins": -0.207122802734375, "rewards/rejected": 3.629215955734253, "step": 4762 }, { "epoch": 1.05, "learning_rate": 4.8046859454548254e-06, "logits/chosen": -1.7408114671707153, "logits/rejected": -1.4938292503356934, "logps/chosen": -167.7406768798828, "logps/rejected": -9.869913101196289, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 8.50890064239502, "rewards/margins": 7.591710567474365, "rewards/rejected": 0.9171900153160095, "step": 4763 }, { "epoch": 1.05, "learning_rate": 4.802894997946367e-06, "logits/chosen": -1.8751274347305298, "logits/rejected": -1.922406554222107, "logps/chosen": -66.48992919921875, "logps/rejected": -101.5016860961914, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 7.090510845184326, "rewards/margins": 1.6210441589355469, "rewards/rejected": 5.469466686248779, "step": 4764 }, { "epoch": 1.05, "learning_rate": 4.801104075765429e-06, "logits/chosen": -2.0422401428222656, "logits/rejected": -2.068873167037964, "logps/chosen": -79.050537109375, "logps/rejected": -161.93565368652344, "loss": 0.1778, "rewards/accuracies": 1.0, "rewards/chosen": 6.6414384841918945, "rewards/margins": 0.8997483253479004, "rewards/rejected": 5.741690158843994, "step": 4765 }, { "epoch": 1.05, "learning_rate": 4.7993131791421385e-06, "logits/chosen": -1.8411788940429688, "logits/rejected": -1.860143780708313, "logps/chosen": -20.464534759521484, "logps/rejected": -67.982666015625, "loss": 0.2192, "rewards/accuracies": 1.0, "rewards/chosen": 3.448688268661499, "rewards/margins": 0.7765798568725586, "rewards/rejected": 2.6721084117889404, "step": 4766 }, { "epoch": 1.06, "learning_rate": 4.797522308306623e-06, "logits/chosen": -1.833044171333313, "logits/rejected": -1.5517868995666504, "logps/chosen": -96.6285171508789, "logps/rejected": -41.83638381958008, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 6.368905067443848, "rewards/margins": 1.8620672225952148, "rewards/rejected": 4.506837844848633, "step": 4767 }, { "epoch": 1.06, "learning_rate": 4.795731463489004e-06, "logits/chosen": -1.809098482131958, "logits/rejected": -1.8327889442443848, "logps/chosen": -31.772228240966797, "logps/rejected": -40.360877990722656, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 3.531839370727539, "rewards/margins": 0.19322991371154785, "rewards/rejected": 3.338609457015991, "step": 4768 }, { "epoch": 1.06, "learning_rate": 4.7939406449194005e-06, "logits/chosen": -1.7080638408660889, "logits/rejected": -1.785326600074768, "logps/chosen": -34.695068359375, "logps/rejected": -100.43795013427734, "loss": 1.8652, "rewards/accuracies": 0.0, "rewards/chosen": 3.210314989089966, "rewards/margins": -3.1292335987091064, "rewards/rejected": 6.339548587799072, "step": 4769 }, { "epoch": 1.06, "learning_rate": 4.792149852827929e-06, "logits/chosen": -2.1533162593841553, "logits/rejected": -2.155489683151245, "logps/chosen": -108.26683044433594, "logps/rejected": -85.59541320800781, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 9.00341796875, "rewards/margins": 4.750593662261963, "rewards/rejected": 4.252824306488037, "step": 4770 }, { "epoch": 1.06, "learning_rate": 4.7903590874447e-06, "logits/chosen": -1.7072221040725708, "logits/rejected": -1.8624598979949951, "logps/chosen": -41.795066833496094, "logps/rejected": -142.8140869140625, "loss": 2.2991, "rewards/accuracies": 0.0, "rewards/chosen": 6.019903659820557, "rewards/margins": -4.475464344024658, "rewards/rejected": 10.495368003845215, "step": 4771 }, { "epoch": 1.06, "learning_rate": 4.788568348999827e-06, "logits/chosen": -1.7647227048873901, "logits/rejected": -1.6974177360534668, "logps/chosen": -94.02981567382812, "logps/rejected": -55.56159973144531, "loss": 0.0964, "rewards/accuracies": 1.0, "rewards/chosen": 4.835614204406738, "rewards/margins": 1.554518222808838, "rewards/rejected": 3.2810959815979004, "step": 4772 }, { "epoch": 1.06, "learning_rate": 4.786777637723411e-06, "logits/chosen": -1.9642348289489746, "logits/rejected": -1.9870212078094482, "logps/chosen": -93.34559631347656, "logps/rejected": -41.71110916137695, "loss": 0.2079, "rewards/accuracies": 1.0, "rewards/chosen": 6.03956937789917, "rewards/margins": 0.6677112579345703, "rewards/rejected": 5.3718581199646, "step": 4773 }, { "epoch": 1.06, "learning_rate": 4.784986953845557e-06, "logits/chosen": -1.9550753831863403, "logits/rejected": -1.9289655685424805, "logps/chosen": -84.12323760986328, "logps/rejected": -104.75336456298828, "loss": 0.1467, "rewards/accuracies": 1.0, "rewards/chosen": 6.793459415435791, "rewards/margins": 1.2206296920776367, "rewards/rejected": 5.572829723358154, "step": 4774 }, { "epoch": 1.06, "learning_rate": 4.783196297596362e-06, "logits/chosen": -1.8592889308929443, "logits/rejected": -1.8592889308929443, "logps/chosen": -29.85013198852539, "logps/rejected": -29.85013198852539, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.5310680866241455, "rewards/margins": 0.0, "rewards/rejected": 3.5310680866241455, "step": 4775 }, { "epoch": 1.06, "learning_rate": 4.781405669205923e-06, "logits/chosen": -2.2403862476348877, "logits/rejected": -2.2381584644317627, "logps/chosen": -73.2270278930664, "logps/rejected": -59.11967468261719, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 7.038930416107178, "rewards/margins": 3.1374213695526123, "rewards/rejected": 3.9015090465545654, "step": 4776 }, { "epoch": 1.06, "learning_rate": 4.779615068904328e-06, "logits/chosen": -1.6131858825683594, "logits/rejected": -1.6092207431793213, "logps/chosen": -19.04259490966797, "logps/rejected": -13.264778137207031, "loss": 0.6731, "rewards/accuracies": 0.0, "rewards/chosen": 0.7123878598213196, "rewards/margins": -0.3927925229072571, "rewards/rejected": 1.1051803827285767, "step": 4777 }, { "epoch": 1.06, "learning_rate": 4.777824496921668e-06, "logits/chosen": -2.237291097640991, "logits/rejected": -2.1803383827209473, "logps/chosen": -42.73475646972656, "logps/rejected": -40.6148681640625, "loss": 0.209, "rewards/accuracies": 1.0, "rewards/chosen": 2.5497682094573975, "rewards/margins": 1.075671911239624, "rewards/rejected": 1.4740962982177734, "step": 4778 }, { "epoch": 1.06, "learning_rate": 4.776033953488026e-06, "logits/chosen": -2.088693141937256, "logits/rejected": -2.0929148197174072, "logps/chosen": -31.00821304321289, "logps/rejected": -53.58232116699219, "loss": 0.2065, "rewards/accuracies": 1.0, "rewards/chosen": 3.4305737018585205, "rewards/margins": 0.6752047538757324, "rewards/rejected": 2.755368947982788, "step": 4779 }, { "epoch": 1.06, "learning_rate": 4.7742434388334815e-06, "logits/chosen": -1.7083652019500732, "logits/rejected": -1.7308424711227417, "logps/chosen": -44.745933532714844, "logps/rejected": -43.662864685058594, "loss": 0.742, "rewards/accuracies": 1.0, "rewards/chosen": 4.24993371963501, "rewards/margins": 0.5717926025390625, "rewards/rejected": 3.6781411170959473, "step": 4780 }, { "epoch": 1.06, "learning_rate": 4.772452953188114e-06, "logits/chosen": -1.924666166305542, "logits/rejected": -1.9056092500686646, "logps/chosen": -56.645301818847656, "logps/rejected": -58.94761657714844, "loss": 0.6382, "rewards/accuracies": 0.0, "rewards/chosen": 4.228663921356201, "rewards/margins": -0.34764862060546875, "rewards/rejected": 4.57631254196167, "step": 4781 }, { "epoch": 1.06, "learning_rate": 4.770662496781993e-06, "logits/chosen": -2.0758869647979736, "logits/rejected": -1.9904779195785522, "logps/chosen": -69.02159118652344, "logps/rejected": -35.42546463012695, "loss": 0.0809, "rewards/accuracies": 1.0, "rewards/chosen": 5.4072465896606445, "rewards/margins": 3.8608338832855225, "rewards/rejected": 1.546412706375122, "step": 4782 }, { "epoch": 1.06, "learning_rate": 4.768872069845191e-06, "logits/chosen": -1.7573611736297607, "logits/rejected": -1.7253170013427734, "logps/chosen": -99.8123550415039, "logps/rejected": -66.68687438964844, "loss": 0.1266, "rewards/accuracies": 1.0, "rewards/chosen": 6.050286293029785, "rewards/margins": 2.654832601547241, "rewards/rejected": 3.395453691482544, "step": 4783 }, { "epoch": 1.06, "learning_rate": 4.7670816726077725e-06, "logits/chosen": -1.718931794166565, "logits/rejected": -1.780084490776062, "logps/chosen": -53.05203628540039, "logps/rejected": -87.08656311035156, "loss": 1.2135, "rewards/accuracies": 0.0, "rewards/chosen": 5.007730007171631, "rewards/margins": -2.2349443435668945, "rewards/rejected": 7.242674350738525, "step": 4784 }, { "epoch": 1.06, "learning_rate": 4.7652913052998e-06, "logits/chosen": -2.0392885208129883, "logits/rejected": -1.967448353767395, "logps/chosen": -89.48129272460938, "logps/rejected": -53.28596878051758, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 7.205522060394287, "rewards/margins": 4.935844421386719, "rewards/rejected": 2.2696774005889893, "step": 4785 }, { "epoch": 1.06, "learning_rate": 4.763500968151329e-06, "logits/chosen": -1.7886687517166138, "logits/rejected": -1.7861748933792114, "logps/chosen": -28.888076782226562, "logps/rejected": -60.68465042114258, "loss": 0.3044, "rewards/accuracies": 1.0, "rewards/chosen": 3.885319471359253, "rewards/margins": 0.2657320499420166, "rewards/rejected": 3.6195874214172363, "step": 4786 }, { "epoch": 1.06, "learning_rate": 4.761710661392416e-06, "logits/chosen": -1.8549796342849731, "logits/rejected": -1.8369966745376587, "logps/chosen": -69.26441955566406, "logps/rejected": -52.65666961669922, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 2.659442901611328, "rewards/margins": 0.6553535461425781, "rewards/rejected": 2.00408935546875, "step": 4787 }, { "epoch": 1.06, "learning_rate": 4.759920385253111e-06, "logits/chosen": -2.067944288253784, "logits/rejected": -2.0761187076568604, "logps/chosen": -55.61329650878906, "logps/rejected": -101.29667663574219, "loss": 0.2696, "rewards/accuracies": 1.0, "rewards/chosen": 6.459104061126709, "rewards/margins": 1.4905509948730469, "rewards/rejected": 4.968553066253662, "step": 4788 }, { "epoch": 1.06, "learning_rate": 4.758130139963456e-06, "logits/chosen": -1.9002940654754639, "logits/rejected": -1.8844712972640991, "logps/chosen": -54.920074462890625, "logps/rejected": -63.67292022705078, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 4.233894348144531, "rewards/margins": 0.7495033740997314, "rewards/rejected": 3.4843909740448, "step": 4789 }, { "epoch": 1.06, "learning_rate": 4.756339925753501e-06, "logits/chosen": -1.8887020349502563, "logits/rejected": -1.8849201202392578, "logps/chosen": -41.74289321899414, "logps/rejected": -56.61534118652344, "loss": 0.3004, "rewards/accuracies": 1.0, "rewards/chosen": 3.3012301921844482, "rewards/margins": 0.19740939140319824, "rewards/rejected": 3.10382080078125, "step": 4790 }, { "epoch": 1.06, "learning_rate": 4.754549742853276e-06, "logits/chosen": -1.6551929712295532, "logits/rejected": -1.6943727731704712, "logps/chosen": -79.14451599121094, "logps/rejected": -59.25933074951172, "loss": 0.1731, "rewards/accuracies": 1.0, "rewards/chosen": 4.993202209472656, "rewards/margins": 0.998769998550415, "rewards/rejected": 3.994432210922241, "step": 4791 }, { "epoch": 1.06, "learning_rate": 4.752759591492821e-06, "logits/chosen": -1.9940812587738037, "logits/rejected": -1.9905580282211304, "logps/chosen": -47.5860481262207, "logps/rejected": -78.26914978027344, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 4.527404308319092, "rewards/margins": 0.7995431423187256, "rewards/rejected": 3.727861166000366, "step": 4792 }, { "epoch": 1.06, "learning_rate": 4.750969471902164e-06, "logits/chosen": -1.9761059284210205, "logits/rejected": -1.9810919761657715, "logps/chosen": -80.42650604248047, "logps/rejected": -47.08203125, "loss": 0.2157, "rewards/accuracies": 1.0, "rewards/chosen": 5.044361114501953, "rewards/margins": 0.6971564292907715, "rewards/rejected": 4.347204685211182, "step": 4793 }, { "epoch": 1.06, "learning_rate": 4.749179384311331e-06, "logits/chosen": -1.9193698167800903, "logits/rejected": -1.805537462234497, "logps/chosen": -112.2366943359375, "logps/rejected": -35.53865051269531, "loss": 0.6373, "rewards/accuracies": 1.0, "rewards/chosen": 6.765844821929932, "rewards/margins": 2.110395908355713, "rewards/rejected": 4.655448913574219, "step": 4794 }, { "epoch": 1.06, "learning_rate": 4.747389328950343e-06, "logits/chosen": -2.1014668941497803, "logits/rejected": -2.060469388961792, "logps/chosen": -69.52994537353516, "logps/rejected": -40.95724105834961, "loss": 0.2044, "rewards/accuracies": 1.0, "rewards/chosen": 3.461118459701538, "rewards/margins": 0.7763783931732178, "rewards/rejected": 2.6847400665283203, "step": 4795 }, { "epoch": 1.06, "learning_rate": 4.745599306049221e-06, "logits/chosen": -1.8741527795791626, "logits/rejected": -1.6590406894683838, "logps/chosen": -191.28533935546875, "logps/rejected": -22.913236618041992, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 6.489386081695557, "rewards/margins": 6.311313629150391, "rewards/rejected": 0.17807236313819885, "step": 4796 }, { "epoch": 1.06, "learning_rate": 4.743809315837977e-06, "logits/chosen": -1.5710573196411133, "logits/rejected": -1.5327025651931763, "logps/chosen": -61.16227722167969, "logps/rejected": -46.63248825073242, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": 3.841691732406616, "rewards/margins": 1.5651791095733643, "rewards/rejected": 2.276512622833252, "step": 4797 }, { "epoch": 1.06, "learning_rate": 4.742019358546617e-06, "logits/chosen": -1.94777512550354, "logits/rejected": -1.9580073356628418, "logps/chosen": -17.04537010192871, "logps/rejected": -61.5233268737793, "loss": 0.4833, "rewards/accuracies": 0.0, "rewards/chosen": 2.3776445388793945, "rewards/margins": -0.4491298198699951, "rewards/rejected": 2.8267743587493896, "step": 4798 }, { "epoch": 1.06, "learning_rate": 4.740229434405153e-06, "logits/chosen": -1.7486873865127563, "logits/rejected": -1.6273655891418457, "logps/chosen": -111.4803237915039, "logps/rejected": -57.02074432373047, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 5.5225348472595215, "rewards/margins": 2.1674559116363525, "rewards/rejected": 3.355078935623169, "step": 4799 }, { "epoch": 1.06, "learning_rate": 4.738439543643578e-06, "logits/chosen": -1.951509714126587, "logits/rejected": -1.9114341735839844, "logps/chosen": -96.68408203125, "logps/rejected": -153.12185668945312, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 11.841005325317383, "rewards/margins": 3.5340213775634766, "rewards/rejected": 8.306983947753906, "step": 4800 }, { "epoch": 1.06, "learning_rate": 4.736649686491896e-06, "logits/chosen": -1.8274602890014648, "logits/rejected": -1.8295433521270752, "logps/chosen": -35.285850524902344, "logps/rejected": -55.36027908325195, "loss": 0.6498, "rewards/accuracies": 0.0, "rewards/chosen": 3.081174612045288, "rewards/margins": -0.8769075870513916, "rewards/rejected": 3.9580821990966797, "step": 4801 }, { "epoch": 1.06, "learning_rate": 4.734859863180095e-06, "logits/chosen": -2.078549861907959, "logits/rejected": -1.926598310470581, "logps/chosen": -136.665283203125, "logps/rejected": -90.19017028808594, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 7.856982707977295, "rewards/margins": 4.755419731140137, "rewards/rejected": 3.1015632152557373, "step": 4802 }, { "epoch": 1.06, "learning_rate": 4.733070073938166e-06, "logits/chosen": -1.8443646430969238, "logits/rejected": -1.726691484451294, "logps/chosen": -98.28034973144531, "logps/rejected": -7.95612096786499, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 3.718592882156372, "rewards/margins": 2.947929620742798, "rewards/rejected": 0.770663321018219, "step": 4803 }, { "epoch": 1.06, "learning_rate": 4.731280318996088e-06, "logits/chosen": -2.234952688217163, "logits/rejected": -2.2136716842651367, "logps/chosen": -131.07801818847656, "logps/rejected": -47.92331314086914, "loss": 0.0798, "rewards/accuracies": 1.0, "rewards/chosen": 9.516298294067383, "rewards/margins": 3.0850024223327637, "rewards/rejected": 6.431295871734619, "step": 4804 }, { "epoch": 1.06, "learning_rate": 4.729490598583845e-06, "logits/chosen": -1.8916152715682983, "logits/rejected": -1.7392277717590332, "logps/chosen": -50.73435592651367, "logps/rejected": -13.616764068603516, "loss": 0.5235, "rewards/accuracies": 1.0, "rewards/chosen": 5.49669075012207, "rewards/margins": 4.278037071228027, "rewards/rejected": 1.2186535596847534, "step": 4805 }, { "epoch": 1.06, "learning_rate": 4.727700912931408e-06, "logits/chosen": -1.7486021518707275, "logits/rejected": -1.7284514904022217, "logps/chosen": -36.431861877441406, "logps/rejected": -78.673828125, "loss": 0.203, "rewards/accuracies": 1.0, "rewards/chosen": 2.4218711853027344, "rewards/margins": 1.025852918624878, "rewards/rejected": 1.3960182666778564, "step": 4806 }, { "epoch": 1.06, "learning_rate": 4.725911262268751e-06, "logits/chosen": -1.9916149377822876, "logits/rejected": -1.8657410144805908, "logps/chosen": -120.46045684814453, "logps/rejected": -33.373374938964844, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 5.864867687225342, "rewards/margins": 4.435884475708008, "rewards/rejected": 1.4289829730987549, "step": 4807 }, { "epoch": 1.06, "learning_rate": 4.724121646825838e-06, "logits/chosen": -1.5605071783065796, "logits/rejected": -1.5605071783065796, "logps/chosen": -23.4616641998291, "logps/rejected": -23.4616641998291, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.734276533126831, "rewards/margins": 0.0, "rewards/rejected": 3.734276533126831, "step": 4808 }, { "epoch": 1.06, "learning_rate": 4.722332066832629e-06, "logits/chosen": -2.0016438961029053, "logits/rejected": -1.9456866979599, "logps/chosen": -84.15157318115234, "logps/rejected": -52.23526382446289, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 6.971790313720703, "rewards/margins": 3.023324966430664, "rewards/rejected": 3.948465347290039, "step": 4809 }, { "epoch": 1.06, "learning_rate": 4.720542522519081e-06, "logits/chosen": -1.8364899158477783, "logits/rejected": -1.8399654626846313, "logps/chosen": -70.62266540527344, "logps/rejected": -149.2379150390625, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 10.599316596984863, "rewards/margins": 3.1127123832702637, "rewards/rejected": 7.4866042137146, "step": 4810 }, { "epoch": 1.06, "learning_rate": 4.718753014115149e-06, "logits/chosen": -1.9993830919265747, "logits/rejected": -2.0185091495513916, "logps/chosen": -29.82732391357422, "logps/rejected": -79.54791259765625, "loss": 0.3219, "rewards/accuracies": 1.0, "rewards/chosen": 3.4021224975585938, "rewards/margins": 0.33823466300964355, "rewards/rejected": 3.06388783454895, "step": 4811 }, { "epoch": 1.07, "learning_rate": 4.716963541850779e-06, "logits/chosen": -1.7565923929214478, "logits/rejected": -1.7649080753326416, "logps/chosen": -113.23283386230469, "logps/rejected": -152.52120971679688, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": 12.702000617980957, "rewards/margins": 4.162370681762695, "rewards/rejected": 8.539629936218262, "step": 4812 }, { "epoch": 1.07, "learning_rate": 4.715174105955911e-06, "logits/chosen": -1.58857262134552, "logits/rejected": -1.4800500869750977, "logps/chosen": -79.17371368408203, "logps/rejected": -12.211851119995117, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 3.293572187423706, "rewards/margins": 2.6356847286224365, "rewards/rejected": 0.6578874588012695, "step": 4813 }, { "epoch": 1.07, "learning_rate": 4.713384706660487e-06, "logits/chosen": -1.8421999216079712, "logits/rejected": -1.8353239297866821, "logps/chosen": -23.898456573486328, "logps/rejected": -47.75893783569336, "loss": 0.7319, "rewards/accuracies": 0.0, "rewards/chosen": 2.0891494750976562, "rewards/margins": -1.1826679706573486, "rewards/rejected": 3.271817445755005, "step": 4814 }, { "epoch": 1.07, "learning_rate": 4.711595344194438e-06, "logits/chosen": -2.1732840538024902, "logits/rejected": -1.804050087928772, "logps/chosen": -43.65950012207031, "logps/rejected": -73.99322509765625, "loss": 0.7641, "rewards/accuracies": 1.0, "rewards/chosen": 4.289583683013916, "rewards/margins": 0.6801857948303223, "rewards/rejected": 3.6093978881835938, "step": 4815 }, { "epoch": 1.07, "learning_rate": 4.709806018787696e-06, "logits/chosen": -1.975917935371399, "logits/rejected": -1.9532690048217773, "logps/chosen": -54.06968688964844, "logps/rejected": -62.522212982177734, "loss": 2.2717, "rewards/accuracies": 1.0, "rewards/chosen": 4.752742290496826, "rewards/margins": 0.74713134765625, "rewards/rejected": 4.005610942840576, "step": 4816 }, { "epoch": 1.07, "learning_rate": 4.708016730670183e-06, "logits/chosen": -1.8291035890579224, "logits/rejected": -1.82797110080719, "logps/chosen": -118.24797058105469, "logps/rejected": -92.17372131347656, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 12.805041313171387, "rewards/margins": 3.8193283081054688, "rewards/rejected": 8.985713005065918, "step": 4817 }, { "epoch": 1.07, "learning_rate": 4.706227480071818e-06, "logits/chosen": -1.6881204843521118, "logits/rejected": -1.6895617246627808, "logps/chosen": -18.800024032592773, "logps/rejected": -71.372314453125, "loss": 0.2091, "rewards/accuracies": 1.0, "rewards/chosen": 1.9403609037399292, "rewards/margins": 0.661585807800293, "rewards/rejected": 1.2787750959396362, "step": 4818 }, { "epoch": 1.07, "learning_rate": 4.704438267222515e-06, "logits/chosen": -1.882298469543457, "logits/rejected": -1.915374755859375, "logps/chosen": -55.83806228637695, "logps/rejected": -108.46316528320312, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": 8.748414993286133, "rewards/margins": 1.2952046394348145, "rewards/rejected": 7.453210353851318, "step": 4819 }, { "epoch": 1.07, "learning_rate": 4.702649092352183e-06, "logits/chosen": -1.7069264650344849, "logits/rejected": -1.7069264650344849, "logps/chosen": -31.41104507446289, "logps/rejected": -31.41104507446289, "loss": 0.4068, "rewards/accuracies": 0.0, "rewards/chosen": 3.5083706378936768, "rewards/margins": 0.0, "rewards/rejected": 3.5083706378936768, "step": 4820 }, { "epoch": 1.07, "learning_rate": 4.700859955690731e-06, "logits/chosen": -2.0475082397460938, "logits/rejected": -1.3847192525863647, "logps/chosen": -53.87662124633789, "logps/rejected": -67.84648132324219, "loss": 0.1707, "rewards/accuracies": 1.0, "rewards/chosen": 7.102180480957031, "rewards/margins": 1.53068208694458, "rewards/rejected": 5.571498394012451, "step": 4821 }, { "epoch": 1.07, "learning_rate": 4.699070857468052e-06, "logits/chosen": -1.9814332723617554, "logits/rejected": -1.9700225591659546, "logps/chosen": -39.65086364746094, "logps/rejected": -52.88824462890625, "loss": 0.5743, "rewards/accuracies": 0.0, "rewards/chosen": 4.072563171386719, "rewards/margins": -0.7247109413146973, "rewards/rejected": 4.797274112701416, "step": 4822 }, { "epoch": 1.07, "learning_rate": 4.697281797914046e-06, "logits/chosen": -2.0392093658447266, "logits/rejected": -1.9944746494293213, "logps/chosen": -115.32936096191406, "logps/rejected": -52.632240295410156, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 8.124406814575195, "rewards/margins": 3.5772461891174316, "rewards/rejected": 4.547160625457764, "step": 4823 }, { "epoch": 1.07, "learning_rate": 4.6954927772586e-06, "logits/chosen": -2.0168871879577637, "logits/rejected": -2.0288214683532715, "logps/chosen": -48.428470611572266, "logps/rejected": -61.06563186645508, "loss": 0.6062, "rewards/accuracies": 0.0, "rewards/chosen": 7.228007793426514, "rewards/margins": -0.8373627662658691, "rewards/rejected": 8.065370559692383, "step": 4824 }, { "epoch": 1.07, "learning_rate": 4.693703795731603e-06, "logits/chosen": -1.7823132276535034, "logits/rejected": -1.6167645454406738, "logps/chosen": -68.39508056640625, "logps/rejected": -10.85107707977295, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": 3.030747175216675, "rewards/margins": 1.8715975284576416, "rewards/rejected": 1.1591496467590332, "step": 4825 }, { "epoch": 1.07, "learning_rate": 4.691914853562929e-06, "logits/chosen": -1.6850495338439941, "logits/rejected": -1.645878553390503, "logps/chosen": -20.962913513183594, "logps/rejected": -32.145423889160156, "loss": 0.66, "rewards/accuracies": 0.0, "rewards/chosen": 3.407827854156494, "rewards/margins": -0.9378256797790527, "rewards/rejected": 4.345653533935547, "step": 4826 }, { "epoch": 1.07, "learning_rate": 4.690125950982458e-06, "logits/chosen": -1.9260936975479126, "logits/rejected": -1.9196293354034424, "logps/chosen": -77.1343994140625, "logps/rejected": -53.580322265625, "loss": 0.3972, "rewards/accuracies": 0.0, "rewards/chosen": 5.333152770996094, "rewards/margins": -0.1668701171875, "rewards/rejected": 5.500022888183594, "step": 4827 }, { "epoch": 1.07, "learning_rate": 4.688337088220058e-06, "logits/chosen": -1.9247595071792603, "logits/rejected": -1.8218052387237549, "logps/chosen": -89.90679931640625, "logps/rejected": -61.16375732421875, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 5.904597759246826, "rewards/margins": 3.79520583152771, "rewards/rejected": 2.109391927719116, "step": 4828 }, { "epoch": 1.07, "learning_rate": 4.68654826550559e-06, "logits/chosen": -1.804487943649292, "logits/rejected": -1.7143959999084473, "logps/chosen": -101.6200942993164, "logps/rejected": -40.50636291503906, "loss": 0.1885, "rewards/accuracies": 1.0, "rewards/chosen": 4.861608028411865, "rewards/margins": 1.7666678428649902, "rewards/rejected": 3.094940185546875, "step": 4829 }, { "epoch": 1.07, "learning_rate": 4.684759483068918e-06, "logits/chosen": -1.5977426767349243, "logits/rejected": -1.575221300125122, "logps/chosen": -26.268741607666016, "logps/rejected": -42.82762908935547, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": 3.1501033306121826, "rewards/margins": 0.34775805473327637, "rewards/rejected": 2.8023452758789062, "step": 4830 }, { "epoch": 1.07, "learning_rate": 4.682970741139893e-06, "logits/chosen": -1.7133663892745972, "logits/rejected": -1.6994925737380981, "logps/chosen": -41.698822021484375, "logps/rejected": -74.49164581298828, "loss": 0.4998, "rewards/accuracies": 1.0, "rewards/chosen": 4.280673503875732, "rewards/margins": 1.0252130031585693, "rewards/rejected": 3.255460500717163, "step": 4831 }, { "epoch": 1.07, "learning_rate": 4.681182039948368e-06, "logits/chosen": -1.7878447771072388, "logits/rejected": -1.5717918872833252, "logps/chosen": -137.2874755859375, "logps/rejected": -38.29315185546875, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 9.8802490234375, "rewards/margins": 7.251611232757568, "rewards/rejected": 2.6286377906799316, "step": 4832 }, { "epoch": 1.07, "learning_rate": 4.679393379724181e-06, "logits/chosen": -1.9266963005065918, "logits/rejected": -1.8449904918670654, "logps/chosen": -52.165191650390625, "logps/rejected": -45.84455871582031, "loss": 0.3013, "rewards/accuracies": 1.0, "rewards/chosen": 3.904148817062378, "rewards/margins": 0.40856385231018066, "rewards/rejected": 3.4955849647521973, "step": 4833 }, { "epoch": 1.07, "learning_rate": 4.677604760697178e-06, "logits/chosen": -2.014892339706421, "logits/rejected": -1.9985759258270264, "logps/chosen": -93.24095153808594, "logps/rejected": -104.14781951904297, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 8.753875732421875, "rewards/margins": 2.569307804107666, "rewards/rejected": 6.184567928314209, "step": 4834 }, { "epoch": 1.07, "learning_rate": 4.675816183097186e-06, "logits/chosen": -2.162079095840454, "logits/rejected": -2.1187946796417236, "logps/chosen": -47.994327545166016, "logps/rejected": -63.270545959472656, "loss": 0.1205, "rewards/accuracies": 1.0, "rewards/chosen": 5.332464218139648, "rewards/margins": 2.3205463886260986, "rewards/rejected": 3.01191782951355, "step": 4835 }, { "epoch": 1.07, "learning_rate": 4.674027647154037e-06, "logits/chosen": -1.8255733251571655, "logits/rejected": -1.7448145151138306, "logps/chosen": -87.77197265625, "logps/rejected": -57.44361877441406, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": 4.445974826812744, "rewards/margins": 1.600736379623413, "rewards/rejected": 2.845238447189331, "step": 4836 }, { "epoch": 1.07, "learning_rate": 4.6722391530975485e-06, "logits/chosen": -1.7307459115982056, "logits/rejected": -1.3029284477233887, "logps/chosen": -39.14462661743164, "logps/rejected": -25.64665412902832, "loss": 0.5065, "rewards/accuracies": 0.0, "rewards/chosen": 3.03776216506958, "rewards/margins": -0.34602856636047363, "rewards/rejected": 3.3837907314300537, "step": 4837 }, { "epoch": 1.07, "learning_rate": 4.670450701157544e-06, "logits/chosen": -1.8893458843231201, "logits/rejected": -1.937941551208496, "logps/chosen": -69.521728515625, "logps/rejected": -129.5681915283203, "loss": 0.4197, "rewards/accuracies": 0.0, "rewards/chosen": 7.063138008117676, "rewards/margins": -0.0197601318359375, "rewards/rejected": 7.082898139953613, "step": 4838 }, { "epoch": 1.07, "learning_rate": 4.668662291563832e-06, "logits/chosen": -1.8496317863464355, "logits/rejected": -1.8496317863464355, "logps/chosen": -39.939239501953125, "logps/rejected": -39.939239501953125, "loss": 0.3877, "rewards/accuracies": 0.0, "rewards/chosen": 4.3485307693481445, "rewards/margins": 0.0, "rewards/rejected": 4.3485307693481445, "step": 4839 }, { "epoch": 1.07, "learning_rate": 4.666873924546217e-06, "logits/chosen": -1.883015513420105, "logits/rejected": -1.7784559726715088, "logps/chosen": -161.1497802734375, "logps/rejected": -57.188316345214844, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 6.872825622558594, "rewards/margins": 3.0614187717437744, "rewards/rejected": 3.8114068508148193, "step": 4840 }, { "epoch": 1.07, "learning_rate": 4.665085600334503e-06, "logits/chosen": -2.1036946773529053, "logits/rejected": -2.1036946773529053, "logps/chosen": -42.321929931640625, "logps/rejected": -42.321929931640625, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 3.110144853591919, "rewards/margins": 0.0, "rewards/rejected": 3.110144853591919, "step": 4841 }, { "epoch": 1.07, "learning_rate": 4.663297319158483e-06, "logits/chosen": -2.054069995880127, "logits/rejected": -2.0105302333831787, "logps/chosen": -43.539283752441406, "logps/rejected": -14.647771835327148, "loss": 0.1266, "rewards/accuracies": 1.0, "rewards/chosen": 4.027635097503662, "rewards/margins": 1.4249844551086426, "rewards/rejected": 2.6026506423950195, "step": 4842 }, { "epoch": 1.07, "learning_rate": 4.66150908124795e-06, "logits/chosen": -1.9459811449050903, "logits/rejected": -1.9185101985931396, "logps/chosen": -78.31185913085938, "logps/rejected": -106.61085510253906, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 5.676976203918457, "rewards/margins": 2.448861837387085, "rewards/rejected": 3.228114366531372, "step": 4843 }, { "epoch": 1.07, "learning_rate": 4.659720886832686e-06, "logits/chosen": -1.9832698106765747, "logits/rejected": -1.9033629894256592, "logps/chosen": -55.390018463134766, "logps/rejected": -18.012958526611328, "loss": 0.3851, "rewards/accuracies": 1.0, "rewards/chosen": 4.770394802093506, "rewards/margins": 4.290096759796143, "rewards/rejected": 0.48029786348342896, "step": 4844 }, { "epoch": 1.07, "learning_rate": 4.65793273614247e-06, "logits/chosen": -2.0801172256469727, "logits/rejected": -2.0821378231048584, "logps/chosen": -96.32981872558594, "logps/rejected": -151.57359313964844, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": 12.247535705566406, "rewards/margins": 3.0456066131591797, "rewards/rejected": 9.201929092407227, "step": 4845 }, { "epoch": 1.07, "learning_rate": 4.656144629407074e-06, "logits/chosen": -2.071176290512085, "logits/rejected": -1.9433366060256958, "logps/chosen": -45.00063705444336, "logps/rejected": -24.360248565673828, "loss": 0.3912, "rewards/accuracies": 1.0, "rewards/chosen": 3.734781265258789, "rewards/margins": 2.372410535812378, "rewards/rejected": 1.3623707294464111, "step": 4846 }, { "epoch": 1.07, "learning_rate": 4.654356566856268e-06, "logits/chosen": -1.7875598669052124, "logits/rejected": -1.7103763818740845, "logps/chosen": -44.20923614501953, "logps/rejected": -3.9626004695892334, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 4.059388160705566, "rewards/margins": 2.618375539779663, "rewards/rejected": 1.4410126209259033, "step": 4847 }, { "epoch": 1.07, "learning_rate": 4.652568548719813e-06, "logits/chosen": -1.9035518169403076, "logits/rejected": -1.9185712337493896, "logps/chosen": -45.241493225097656, "logps/rejected": -47.118743896484375, "loss": 0.4007, "rewards/accuracies": 1.0, "rewards/chosen": 3.5061538219451904, "rewards/margins": 0.6492278575897217, "rewards/rejected": 2.8569259643554688, "step": 4848 }, { "epoch": 1.07, "learning_rate": 4.650780575227461e-06, "logits/chosen": -1.658737301826477, "logits/rejected": -1.6512776613235474, "logps/chosen": -76.81416320800781, "logps/rejected": -57.805152893066406, "loss": 0.7745, "rewards/accuracies": 1.0, "rewards/chosen": 3.3226821422576904, "rewards/margins": 1.179990291595459, "rewards/rejected": 2.1426918506622314, "step": 4849 }, { "epoch": 1.07, "learning_rate": 4.648992646608968e-06, "logits/chosen": -1.9671021699905396, "logits/rejected": -1.9202420711517334, "logps/chosen": -100.38166046142578, "logps/rejected": -62.59027862548828, "loss": 0.2062, "rewards/accuracies": 1.0, "rewards/chosen": 5.974271297454834, "rewards/margins": 0.8834066390991211, "rewards/rejected": 5.090864658355713, "step": 4850 }, { "epoch": 1.07, "learning_rate": 4.6472047630940755e-06, "logits/chosen": -1.9348591566085815, "logits/rejected": -1.8637268543243408, "logps/chosen": -92.24712371826172, "logps/rejected": -17.780696868896484, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 4.727393627166748, "rewards/margins": 2.086555004119873, "rewards/rejected": 2.640838623046875, "step": 4851 }, { "epoch": 1.07, "learning_rate": 4.645416924912523e-06, "logits/chosen": -1.9994580745697021, "logits/rejected": -1.8673418760299683, "logps/chosen": -86.69197082519531, "logps/rejected": -21.478256225585938, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 4.093100070953369, "rewards/margins": 4.174650192260742, "rewards/rejected": -0.0815502181649208, "step": 4852 }, { "epoch": 1.07, "learning_rate": 4.6436291322940425e-06, "logits/chosen": -1.7326093912124634, "logits/rejected": -1.633503794670105, "logps/chosen": -99.33244323730469, "logps/rejected": -56.835723876953125, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": 6.213540554046631, "rewards/margins": 2.875260829925537, "rewards/rejected": 3.3382797241210938, "step": 4853 }, { "epoch": 1.07, "learning_rate": 4.641841385468363e-06, "logits/chosen": -2.0282297134399414, "logits/rejected": -1.9547804594039917, "logps/chosen": -119.92845153808594, "logps/rejected": -86.94709777832031, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 7.190940856933594, "rewards/margins": 3.080418109893799, "rewards/rejected": 4.110522747039795, "step": 4854 }, { "epoch": 1.07, "learning_rate": 4.640053684665203e-06, "logits/chosen": -1.666669249534607, "logits/rejected": -1.6338433027267456, "logps/chosen": -32.86283874511719, "logps/rejected": -55.57196807861328, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": 3.3806214332580566, "rewards/margins": 2.0859503746032715, "rewards/rejected": 1.2946709394454956, "step": 4855 }, { "epoch": 1.07, "learning_rate": 4.638266030114281e-06, "logits/chosen": -1.7584630250930786, "logits/rejected": -1.707297444343567, "logps/chosen": -98.05191802978516, "logps/rejected": -73.15643310546875, "loss": 0.9304, "rewards/accuracies": 1.0, "rewards/chosen": 5.867977142333984, "rewards/margins": 3.17098069190979, "rewards/rejected": 2.6969964504241943, "step": 4856 }, { "epoch": 1.08, "learning_rate": 4.636478422045302e-06, "logits/chosen": -1.8477891683578491, "logits/rejected": -1.716521143913269, "logps/chosen": -65.70326232910156, "logps/rejected": -14.314065933227539, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 3.863729953765869, "rewards/margins": 3.5388259887695312, "rewards/rejected": 0.32490405440330505, "step": 4857 }, { "epoch": 1.08, "learning_rate": 4.634690860687974e-06, "logits/chosen": -1.7454768419265747, "logits/rejected": -1.7445000410079956, "logps/chosen": -25.41963768005371, "logps/rejected": -54.05815124511719, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 2.815722942352295, "rewards/margins": 0.2407991886138916, "rewards/rejected": 2.5749237537384033, "step": 4858 }, { "epoch": 1.08, "learning_rate": 4.632903346271993e-06, "logits/chosen": -1.9447871446609497, "logits/rejected": -1.9447871446609497, "logps/chosen": -55.636695861816406, "logps/rejected": -55.636695861816406, "loss": 0.353, "rewards/accuracies": 0.0, "rewards/chosen": 4.024487495422363, "rewards/margins": 0.0, "rewards/rejected": 4.024487495422363, "step": 4859 }, { "epoch": 1.08, "learning_rate": 4.631115879027048e-06, "logits/chosen": -1.7746753692626953, "logits/rejected": -1.7522294521331787, "logps/chosen": -52.90245819091797, "logps/rejected": -74.68215942382812, "loss": 0.2055, "rewards/accuracies": 1.0, "rewards/chosen": 3.3491127490997314, "rewards/margins": 0.9194450378417969, "rewards/rejected": 2.4296677112579346, "step": 4860 }, { "epoch": 1.08, "learning_rate": 4.629328459182827e-06, "logits/chosen": -1.959094524383545, "logits/rejected": -1.9151790142059326, "logps/chosen": -138.82290649414062, "logps/rejected": -31.0280818939209, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 6.345012187957764, "rewards/margins": 4.713656425476074, "rewards/rejected": 1.631355881690979, "step": 4861 }, { "epoch": 1.08, "learning_rate": 4.627541086969006e-06, "logits/chosen": -1.7426679134368896, "logits/rejected": -1.7426679134368896, "logps/chosen": -42.32488250732422, "logps/rejected": -42.32488250732422, "loss": 0.3739, "rewards/accuracies": 0.0, "rewards/chosen": 1.4587246179580688, "rewards/margins": 0.0, "rewards/rejected": 1.4587246179580688, "step": 4862 }, { "epoch": 1.08, "learning_rate": 4.6257537626152615e-06, "logits/chosen": -1.8451206684112549, "logits/rejected": -1.8451206684112549, "logps/chosen": -39.29908752441406, "logps/rejected": -39.29908752441406, "loss": 0.3492, "rewards/accuracies": 0.0, "rewards/chosen": 7.409577369689941, "rewards/margins": 0.0, "rewards/rejected": 7.409577369689941, "step": 4863 }, { "epoch": 1.08, "learning_rate": 4.623966486351257e-06, "logits/chosen": -1.7940582036972046, "logits/rejected": -1.7121609449386597, "logps/chosen": -63.67935562133789, "logps/rejected": -19.5054874420166, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 3.705296039581299, "rewards/margins": 1.808221697807312, "rewards/rejected": 1.8970743417739868, "step": 4864 }, { "epoch": 1.08, "learning_rate": 4.6221792584066575e-06, "logits/chosen": -2.1116929054260254, "logits/rejected": -2.1264801025390625, "logps/chosen": -46.96259307861328, "logps/rejected": -72.12188720703125, "loss": 0.248, "rewards/accuracies": 1.0, "rewards/chosen": 4.8650078773498535, "rewards/margins": 0.4700889587402344, "rewards/rejected": 4.394918918609619, "step": 4865 }, { "epoch": 1.08, "learning_rate": 4.620392079011113e-06, "logits/chosen": -2.022286891937256, "logits/rejected": -1.6501208543777466, "logps/chosen": -24.947376251220703, "logps/rejected": -21.106412887573242, "loss": 0.2652, "rewards/accuracies": 1.0, "rewards/chosen": 2.7975680828094482, "rewards/margins": 0.7231519222259521, "rewards/rejected": 2.074416160583496, "step": 4866 }, { "epoch": 1.08, "learning_rate": 4.618604948394275e-06, "logits/chosen": -2.309926986694336, "logits/rejected": -2.332151412963867, "logps/chosen": -56.78270721435547, "logps/rejected": -80.76239013671875, "loss": 0.932, "rewards/accuracies": 0.0, "rewards/chosen": 5.411059856414795, "rewards/margins": -1.6949748992919922, "rewards/rejected": 7.106034755706787, "step": 4867 }, { "epoch": 1.08, "learning_rate": 4.616817866785784e-06, "logits/chosen": -1.8435401916503906, "logits/rejected": -1.8016257286071777, "logps/chosen": -29.49943733215332, "logps/rejected": -48.46190643310547, "loss": 0.6882, "rewards/accuracies": 0.0, "rewards/chosen": 1.9684003591537476, "rewards/margins": -0.07559525966644287, "rewards/rejected": 2.0439956188201904, "step": 4868 }, { "epoch": 1.08, "learning_rate": 4.615030834415277e-06, "logits/chosen": -1.682989239692688, "logits/rejected": -1.682989239692688, "logps/chosen": -39.00605773925781, "logps/rejected": -39.00605773925781, "loss": 0.4035, "rewards/accuracies": 0.0, "rewards/chosen": 6.93801736831665, "rewards/margins": 0.0, "rewards/rejected": 6.93801736831665, "step": 4869 }, { "epoch": 1.08, "learning_rate": 4.613243851512381e-06, "logits/chosen": -1.9249036312103271, "logits/rejected": -1.8782597780227661, "logps/chosen": -86.02281188964844, "logps/rejected": -91.84538269042969, "loss": 0.1542, "rewards/accuracies": 1.0, "rewards/chosen": 5.81716775894165, "rewards/margins": 2.866422414779663, "rewards/rejected": 2.9507453441619873, "step": 4870 }, { "epoch": 1.08, "learning_rate": 4.6114569183067195e-06, "logits/chosen": -1.955743432044983, "logits/rejected": -1.9249495267868042, "logps/chosen": -94.9228515625, "logps/rejected": -74.52098083496094, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": 4.571155071258545, "rewards/margins": 1.324936866760254, "rewards/rejected": 3.246218204498291, "step": 4871 }, { "epoch": 1.08, "learning_rate": 4.609670035027912e-06, "logits/chosen": -2.1166579723358154, "logits/rejected": -2.0890355110168457, "logps/chosen": -90.861572265625, "logps/rejected": -66.00894165039062, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 6.33843994140625, "rewards/margins": 2.2172818183898926, "rewards/rejected": 4.121158123016357, "step": 4872 }, { "epoch": 1.08, "learning_rate": 4.6078832019055645e-06, "logits/chosen": -1.7641479969024658, "logits/rejected": -1.7485363483428955, "logps/chosen": -141.30899047851562, "logps/rejected": -122.93663024902344, "loss": 0.1101, "rewards/accuracies": 1.0, "rewards/chosen": 10.658342361450195, "rewards/margins": 1.8236818313598633, "rewards/rejected": 8.834660530090332, "step": 4873 }, { "epoch": 1.08, "learning_rate": 4.606096419169285e-06, "logits/chosen": -2.065420389175415, "logits/rejected": -2.050079345703125, "logps/chosen": -62.63577651977539, "logps/rejected": -73.77648162841797, "loss": 0.2318, "rewards/accuracies": 1.0, "rewards/chosen": 4.145168781280518, "rewards/margins": 0.5289890766143799, "rewards/rejected": 3.6161797046661377, "step": 4874 }, { "epoch": 1.08, "learning_rate": 4.604309687048667e-06, "logits/chosen": -2.095675468444824, "logits/rejected": -2.097256898880005, "logps/chosen": -94.66136169433594, "logps/rejected": -104.93413543701172, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 8.078299522399902, "rewards/margins": 3.153262138366699, "rewards/rejected": 4.925037384033203, "step": 4875 }, { "epoch": 1.08, "learning_rate": 4.6025230057733045e-06, "logits/chosen": -2.2319509983062744, "logits/rejected": -2.22369122505188, "logps/chosen": -40.76231002807617, "logps/rejected": -55.867286682128906, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 5.112034320831299, "rewards/margins": 2.1001689434051514, "rewards/rejected": 3.0118653774261475, "step": 4876 }, { "epoch": 1.08, "learning_rate": 4.60073637557278e-06, "logits/chosen": -1.9344364404678345, "logits/rejected": -1.9344364404678345, "logps/chosen": -49.998844146728516, "logps/rejected": -49.998844146728516, "loss": 0.4413, "rewards/accuracies": 0.0, "rewards/chosen": 8.831855773925781, "rewards/margins": 0.0, "rewards/rejected": 8.831855773925781, "step": 4877 }, { "epoch": 1.08, "learning_rate": 4.598949796676672e-06, "logits/chosen": -1.6608518362045288, "logits/rejected": -1.6361552476882935, "logps/chosen": -27.397846221923828, "logps/rejected": -19.851438522338867, "loss": 0.3486, "rewards/accuracies": 0.0, "rewards/chosen": 1.7784550189971924, "rewards/margins": -0.005990028381347656, "rewards/rejected": 1.78444504737854, "step": 4878 }, { "epoch": 1.08, "learning_rate": 4.597163269314551e-06, "logits/chosen": -1.7616840600967407, "logits/rejected": -1.7115426063537598, "logps/chosen": -35.6252555847168, "logps/rejected": -43.88508605957031, "loss": 0.2086, "rewards/accuracies": 1.0, "rewards/chosen": 3.548401355743408, "rewards/margins": 1.1489064693450928, "rewards/rejected": 2.3994948863983154, "step": 4879 }, { "epoch": 1.08, "learning_rate": 4.595376793715978e-06, "logits/chosen": -2.215689182281494, "logits/rejected": -2.1857216358184814, "logps/chosen": -57.3720588684082, "logps/rejected": -71.80467987060547, "loss": 2.9395, "rewards/accuracies": 1.0, "rewards/chosen": 4.691941738128662, "rewards/margins": 2.053285837173462, "rewards/rejected": 2.6386559009552, "step": 4880 }, { "epoch": 1.08, "learning_rate": 4.593590370110519e-06, "logits/chosen": -1.5461169481277466, "logits/rejected": -1.3914915323257446, "logps/chosen": -43.85255813598633, "logps/rejected": -12.176300048828125, "loss": 0.1918, "rewards/accuracies": 1.0, "rewards/chosen": 3.6303157806396484, "rewards/margins": 2.831700563430786, "rewards/rejected": 0.7986152768135071, "step": 4881 }, { "epoch": 1.08, "learning_rate": 4.591803998727717e-06, "logits/chosen": -1.9221315383911133, "logits/rejected": -1.8741450309753418, "logps/chosen": -58.27996063232422, "logps/rejected": -41.9987678527832, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 2.96785044670105, "rewards/margins": 2.050220489501953, "rewards/rejected": 0.9176300168037415, "step": 4882 }, { "epoch": 1.08, "learning_rate": 4.590017679797122e-06, "logits/chosen": -1.8926299810409546, "logits/rejected": -1.9299582242965698, "logps/chosen": -78.44406127929688, "logps/rejected": -138.43466186523438, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 8.500140190124512, "rewards/margins": 1.1085386276245117, "rewards/rejected": 7.3916015625, "step": 4883 }, { "epoch": 1.08, "learning_rate": 4.588231413548268e-06, "logits/chosen": -2.000091314315796, "logits/rejected": -1.995826005935669, "logps/chosen": -56.06315612792969, "logps/rejected": -195.8995361328125, "loss": 0.2004, "rewards/accuracies": 1.0, "rewards/chosen": 6.824737548828125, "rewards/margins": 1.8880400657653809, "rewards/rejected": 4.936697483062744, "step": 4884 }, { "epoch": 1.08, "learning_rate": 4.586445200210689e-06, "logits/chosen": -1.8297655582427979, "logits/rejected": -1.822934627532959, "logps/chosen": -26.65594482421875, "logps/rejected": -66.13330078125, "loss": 1.8512, "rewards/accuracies": 0.0, "rewards/chosen": 3.1389729976654053, "rewards/margins": -3.5939342975616455, "rewards/rejected": 6.732907295227051, "step": 4885 }, { "epoch": 1.08, "learning_rate": 4.5846590400139066e-06, "logits/chosen": -2.0614917278289795, "logits/rejected": -2.0614917278289795, "logps/chosen": -11.547808647155762, "logps/rejected": -11.547808647155762, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.4824169874191284, "rewards/margins": 0.0, "rewards/rejected": 1.4824169874191284, "step": 4886 }, { "epoch": 1.08, "learning_rate": 4.58287293318744e-06, "logits/chosen": -1.8169552087783813, "logits/rejected": -1.7978039979934692, "logps/chosen": -61.815956115722656, "logps/rejected": -57.08184814453125, "loss": 0.499, "rewards/accuracies": 1.0, "rewards/chosen": 3.5098960399627686, "rewards/margins": 1.1186513900756836, "rewards/rejected": 2.391244649887085, "step": 4887 }, { "epoch": 1.08, "learning_rate": 4.5810868799607975e-06, "logits/chosen": -1.7712491750717163, "logits/rejected": -1.7501494884490967, "logps/chosen": -119.91649627685547, "logps/rejected": -42.48558044433594, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": 4.841556549072266, "rewards/margins": 1.8892974853515625, "rewards/rejected": 2.952259063720703, "step": 4888 }, { "epoch": 1.08, "learning_rate": 4.579300880563485e-06, "logits/chosen": -1.774916648864746, "logits/rejected": -1.774916648864746, "logps/chosen": -58.91014862060547, "logps/rejected": -58.91014862060547, "loss": 0.4526, "rewards/accuracies": 0.0, "rewards/chosen": 5.2108683586120605, "rewards/margins": 0.0, "rewards/rejected": 5.2108683586120605, "step": 4889 }, { "epoch": 1.08, "learning_rate": 4.5775149352249985e-06, "logits/chosen": -1.7337095737457275, "logits/rejected": -1.7303105592727661, "logps/chosen": -47.53157043457031, "logps/rejected": -59.636661529541016, "loss": 0.9003, "rewards/accuracies": 0.0, "rewards/chosen": 3.8600900173187256, "rewards/margins": -1.5578882694244385, "rewards/rejected": 5.417978286743164, "step": 4890 }, { "epoch": 1.08, "learning_rate": 4.575729044174825e-06, "logits/chosen": -2.1848645210266113, "logits/rejected": -2.155292272567749, "logps/chosen": -81.59820556640625, "logps/rejected": -54.688934326171875, "loss": 0.1665, "rewards/accuracies": 1.0, "rewards/chosen": 6.750907897949219, "rewards/margins": 1.1120681762695312, "rewards/rejected": 5.6388397216796875, "step": 4891 }, { "epoch": 1.08, "learning_rate": 4.573943207642452e-06, "logits/chosen": -1.9482530355453491, "logits/rejected": -2.0125157833099365, "logps/chosen": -71.57435607910156, "logps/rejected": -128.7635498046875, "loss": 0.397, "rewards/accuracies": 1.0, "rewards/chosen": 12.12329387664795, "rewards/margins": 0.03525829315185547, "rewards/rejected": 12.088035583496094, "step": 4892 }, { "epoch": 1.08, "learning_rate": 4.572157425857351e-06, "logits/chosen": -2.2632241249084473, "logits/rejected": -2.225674867630005, "logps/chosen": -115.47743225097656, "logps/rejected": -33.72258377075195, "loss": 0.1464, "rewards/accuracies": 1.0, "rewards/chosen": 6.800703525543213, "rewards/margins": 5.845174312591553, "rewards/rejected": 0.9555293917655945, "step": 4893 }, { "epoch": 1.08, "learning_rate": 4.570371699048992e-06, "logits/chosen": -1.9042675495147705, "logits/rejected": -1.8358746767044067, "logps/chosen": -87.35005187988281, "logps/rejected": -96.2318115234375, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": 3.6233108043670654, "rewards/margins": 3.2724974155426025, "rewards/rejected": 0.3508132994174957, "step": 4894 }, { "epoch": 1.08, "learning_rate": 4.568586027446838e-06, "logits/chosen": -2.1413803100585938, "logits/rejected": -2.076200246810913, "logps/chosen": -46.99168014526367, "logps/rejected": -18.990861892700195, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 3.6904942989349365, "rewards/margins": 2.7601351737976074, "rewards/rejected": 0.9303590655326843, "step": 4895 }, { "epoch": 1.08, "learning_rate": 4.5668004112803424e-06, "logits/chosen": -2.1199448108673096, "logits/rejected": -2.0780270099639893, "logps/chosen": -71.86085510253906, "logps/rejected": -52.621463775634766, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": 5.647334575653076, "rewards/margins": 2.674426794052124, "rewards/rejected": 2.972907781600952, "step": 4896 }, { "epoch": 1.08, "learning_rate": 4.565014850778951e-06, "logits/chosen": -1.7656128406524658, "logits/rejected": -1.8804265260696411, "logps/chosen": -34.925357818603516, "logps/rejected": -89.86293029785156, "loss": 2.2471, "rewards/accuracies": 0.0, "rewards/chosen": 5.333054065704346, "rewards/margins": -4.2981858253479, "rewards/rejected": 9.631239891052246, "step": 4897 }, { "epoch": 1.08, "learning_rate": 4.563229346172107e-06, "logits/chosen": -1.7992310523986816, "logits/rejected": -1.753825306892395, "logps/chosen": -30.917720794677734, "logps/rejected": -57.60506057739258, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 2.9915249347686768, "rewards/margins": 0.442873477935791, "rewards/rejected": 2.5486514568328857, "step": 4898 }, { "epoch": 1.08, "learning_rate": 4.561443897689242e-06, "logits/chosen": -1.734885573387146, "logits/rejected": -1.805380940437317, "logps/chosen": -22.187557220458984, "logps/rejected": -59.990543365478516, "loss": 1.4802, "rewards/accuracies": 0.0, "rewards/chosen": 4.831727981567383, "rewards/margins": -2.6865034103393555, "rewards/rejected": 7.518231391906738, "step": 4899 }, { "epoch": 1.08, "learning_rate": 4.559658505559781e-06, "logits/chosen": -1.9329875707626343, "logits/rejected": -1.9166409969329834, "logps/chosen": -42.585601806640625, "logps/rejected": -90.89427185058594, "loss": 0.489, "rewards/accuracies": 1.0, "rewards/chosen": 2.64573073387146, "rewards/margins": 0.483278751373291, "rewards/rejected": 2.162451982498169, "step": 4900 }, { "epoch": 1.08, "learning_rate": 4.557873170013144e-06, "logits/chosen": -1.9731496572494507, "logits/rejected": -1.8656017780303955, "logps/chosen": -79.31033325195312, "logps/rejected": -23.201128005981445, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": 5.233642578125, "rewards/margins": 4.348575592041016, "rewards/rejected": 0.8850671648979187, "step": 4901 }, { "epoch": 1.08, "learning_rate": 4.556087891278742e-06, "logits/chosen": -1.9001799821853638, "logits/rejected": -1.9050953388214111, "logps/chosen": -23.802562713623047, "logps/rejected": -32.57948684692383, "loss": 0.3166, "rewards/accuracies": 1.0, "rewards/chosen": 3.6431567668914795, "rewards/margins": 0.6572856903076172, "rewards/rejected": 2.9858710765838623, "step": 4902 }, { "epoch": 1.09, "learning_rate": 4.55430266958598e-06, "logits/chosen": -1.9898085594177246, "logits/rejected": -1.8875426054000854, "logps/chosen": -97.44246673583984, "logps/rejected": -40.726078033447266, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 6.460011959075928, "rewards/margins": 4.368327617645264, "rewards/rejected": 2.091684341430664, "step": 4903 }, { "epoch": 1.09, "learning_rate": 4.552517505164251e-06, "logits/chosen": -1.8635940551757812, "logits/rejected": -1.8431710004806519, "logps/chosen": -39.08488464355469, "logps/rejected": -29.693735122680664, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 2.94431471824646, "rewards/margins": 0.7550570964813232, "rewards/rejected": 2.1892576217651367, "step": 4904 }, { "epoch": 1.09, "learning_rate": 4.550732398242949e-06, "logits/chosen": -1.9618805646896362, "logits/rejected": -1.7511231899261475, "logps/chosen": -100.9659423828125, "logps/rejected": -38.11935806274414, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 6.6580810546875, "rewards/margins": 4.172863960266113, "rewards/rejected": 2.4852168560028076, "step": 4905 }, { "epoch": 1.09, "learning_rate": 4.548947349051452e-06, "logits/chosen": -1.9258099794387817, "logits/rejected": -1.9258099794387817, "logps/chosen": -44.688995361328125, "logps/rejected": -44.688995361328125, "loss": 0.3818, "rewards/accuracies": 0.0, "rewards/chosen": 4.596072673797607, "rewards/margins": 0.0, "rewards/rejected": 4.596072673797607, "step": 4906 }, { "epoch": 1.09, "learning_rate": 4.547162357819137e-06, "logits/chosen": -1.9812687635421753, "logits/rejected": -1.9504833221435547, "logps/chosen": -28.32811164855957, "logps/rejected": -57.33270263671875, "loss": 1.3316, "rewards/accuracies": 0.0, "rewards/chosen": 4.531623840332031, "rewards/margins": -0.857759952545166, "rewards/rejected": 5.389383792877197, "step": 4907 }, { "epoch": 1.09, "learning_rate": 4.54537742477537e-06, "logits/chosen": -1.972774863243103, "logits/rejected": -1.903891682624817, "logps/chosen": -103.46533966064453, "logps/rejected": -77.74966430664062, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": 4.969028472900391, "rewards/margins": 3.091036319732666, "rewards/rejected": 1.8779922723770142, "step": 4908 }, { "epoch": 1.09, "learning_rate": 4.543592550149512e-06, "logits/chosen": -1.8480225801467896, "logits/rejected": -1.746035099029541, "logps/chosen": -47.21320724487305, "logps/rejected": -58.73500061035156, "loss": 0.5752, "rewards/accuracies": 0.0, "rewards/chosen": 3.5521304607391357, "rewards/margins": -0.10505795478820801, "rewards/rejected": 3.6571884155273438, "step": 4909 }, { "epoch": 1.09, "learning_rate": 4.541807734170915e-06, "logits/chosen": -1.8611887693405151, "logits/rejected": -1.7254976034164429, "logps/chosen": -98.36965942382812, "logps/rejected": -20.79654884338379, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 6.450204372406006, "rewards/margins": 4.698687553405762, "rewards/rejected": 1.7515169382095337, "step": 4910 }, { "epoch": 1.09, "learning_rate": 4.540022977068922e-06, "logits/chosen": -2.068085193634033, "logits/rejected": -2.038346290588379, "logps/chosen": -42.07948684692383, "logps/rejected": -71.78173828125, "loss": 0.2703, "rewards/accuracies": 1.0, "rewards/chosen": 2.9564175605773926, "rewards/margins": 0.47725868225097656, "rewards/rejected": 2.479158878326416, "step": 4911 }, { "epoch": 1.09, "learning_rate": 4.53823827907287e-06, "logits/chosen": -1.8287206888198853, "logits/rejected": -1.7263643741607666, "logps/chosen": -117.23532104492188, "logps/rejected": -24.414104461669922, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 6.729671001434326, "rewards/margins": 6.323156833648682, "rewards/rejected": 0.40651437640190125, "step": 4912 }, { "epoch": 1.09, "learning_rate": 4.536453640412088e-06, "logits/chosen": -2.0374536514282227, "logits/rejected": -2.045924186706543, "logps/chosen": -94.32637786865234, "logps/rejected": -29.967174530029297, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": 6.245938777923584, "rewards/margins": 3.2087831497192383, "rewards/rejected": 3.0371556282043457, "step": 4913 }, { "epoch": 1.09, "learning_rate": 4.534669061315901e-06, "logits/chosen": -1.9230902194976807, "logits/rejected": -1.9209167957305908, "logps/chosen": -98.68033599853516, "logps/rejected": -169.50405883789062, "loss": 0.1288, "rewards/accuracies": 1.0, "rewards/chosen": 8.362157821655273, "rewards/margins": 1.3711953163146973, "rewards/rejected": 6.990962505340576, "step": 4914 }, { "epoch": 1.09, "learning_rate": 4.532884542013619e-06, "logits/chosen": -1.614455223083496, "logits/rejected": -1.5918740034103394, "logps/chosen": -24.25884246826172, "logps/rejected": -123.90648651123047, "loss": 0.1919, "rewards/accuracies": 1.0, "rewards/chosen": 3.8460934162139893, "rewards/margins": 0.767714262008667, "rewards/rejected": 3.0783791542053223, "step": 4915 }, { "epoch": 1.09, "learning_rate": 4.531100082734553e-06, "logits/chosen": -1.8139127492904663, "logits/rejected": -1.7995116710662842, "logps/chosen": -36.708091735839844, "logps/rejected": -60.2847785949707, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": 3.2951416969299316, "rewards/margins": 1.491308331489563, "rewards/rejected": 1.8038333654403687, "step": 4916 }, { "epoch": 1.09, "learning_rate": 4.529315683707997e-06, "logits/chosen": -1.9405133724212646, "logits/rejected": -1.8706938028335571, "logps/chosen": -100.83838653564453, "logps/rejected": -51.934959411621094, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 7.0156168937683105, "rewards/margins": 2.79058837890625, "rewards/rejected": 4.2250285148620605, "step": 4917 }, { "epoch": 1.09, "learning_rate": 4.527531345163245e-06, "logits/chosen": -1.9081026315689087, "logits/rejected": -1.8459786176681519, "logps/chosen": -147.126220703125, "logps/rejected": -62.1219482421875, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 7.628879070281982, "rewards/margins": 2.9053916931152344, "rewards/rejected": 4.723487377166748, "step": 4918 }, { "epoch": 1.09, "learning_rate": 4.52574706732958e-06, "logits/chosen": -1.7304106950759888, "logits/rejected": -1.707275390625, "logps/chosen": -17.953426361083984, "logps/rejected": -4.613960266113281, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 2.361412525177002, "rewards/margins": 1.4400033950805664, "rewards/rejected": 0.9214091300964355, "step": 4919 }, { "epoch": 1.09, "learning_rate": 4.523962850436276e-06, "logits/chosen": -1.7279040813446045, "logits/rejected": -1.67300283908844, "logps/chosen": -59.98921203613281, "logps/rejected": -83.17884063720703, "loss": 0.3827, "rewards/accuracies": 1.0, "rewards/chosen": 3.7168502807617188, "rewards/margins": 2.590823173522949, "rewards/rejected": 1.12602698802948, "step": 4920 }, { "epoch": 1.09, "learning_rate": 4.5221786947126026e-06, "logits/chosen": -2.034247636795044, "logits/rejected": -1.9532585144042969, "logps/chosen": -71.75597381591797, "logps/rejected": -115.9072036743164, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 11.005448341369629, "rewards/margins": 2.564700126647949, "rewards/rejected": 8.44074821472168, "step": 4921 }, { "epoch": 1.09, "learning_rate": 4.5203946003878156e-06, "logits/chosen": -1.7303431034088135, "logits/rejected": -1.780655860900879, "logps/chosen": -50.459476470947266, "logps/rejected": -75.15020751953125, "loss": 1.8411, "rewards/accuracies": 0.0, "rewards/chosen": 4.52790641784668, "rewards/margins": -3.6510581970214844, "rewards/rejected": 8.178964614868164, "step": 4922 }, { "epoch": 1.09, "learning_rate": 4.518610567691171e-06, "logits/chosen": -1.9427684545516968, "logits/rejected": -1.7222118377685547, "logps/chosen": -127.59803009033203, "logps/rejected": -63.23884201049805, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 5.505744457244873, "rewards/margins": 6.046968936920166, "rewards/rejected": -0.5412243008613586, "step": 4923 }, { "epoch": 1.09, "learning_rate": 4.516826596851911e-06, "logits/chosen": -2.1427817344665527, "logits/rejected": -2.0968124866485596, "logps/chosen": -110.54202270507812, "logps/rejected": -100.93574523925781, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 7.887750148773193, "rewards/margins": 3.80961275100708, "rewards/rejected": 4.078137397766113, "step": 4924 }, { "epoch": 1.09, "learning_rate": 4.515042688099273e-06, "logits/chosen": -1.7828203439712524, "logits/rejected": -1.6829888820648193, "logps/chosen": -103.49170684814453, "logps/rejected": -55.99151611328125, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 6.204312801361084, "rewards/margins": 4.02897834777832, "rewards/rejected": 2.1753342151641846, "step": 4925 }, { "epoch": 1.09, "learning_rate": 4.513258841662482e-06, "logits/chosen": -2.1808552742004395, "logits/rejected": -2.189566135406494, "logps/chosen": -73.85746002197266, "logps/rejected": -96.323486328125, "loss": 1.6151, "rewards/accuracies": 0.0, "rewards/chosen": 4.881110668182373, "rewards/margins": -1.8812460899353027, "rewards/rejected": 6.762356758117676, "step": 4926 }, { "epoch": 1.09, "learning_rate": 4.5114750577707606e-06, "logits/chosen": -1.6127629280090332, "logits/rejected": -1.5836048126220703, "logps/chosen": -15.577213287353516, "logps/rejected": -15.225857734680176, "loss": 0.6405, "rewards/accuracies": 1.0, "rewards/chosen": 1.5701093673706055, "rewards/margins": 0.1261376142501831, "rewards/rejected": 1.4439717531204224, "step": 4927 }, { "epoch": 1.09, "learning_rate": 4.509691336653319e-06, "logits/chosen": -1.8722373247146606, "logits/rejected": -1.8413481712341309, "logps/chosen": -64.95272827148438, "logps/rejected": -76.17196655273438, "loss": 0.2466, "rewards/accuracies": 1.0, "rewards/chosen": 4.2600297927856445, "rewards/margins": 1.0720055103302002, "rewards/rejected": 3.1880242824554443, "step": 4928 }, { "epoch": 1.09, "learning_rate": 4.507907678539364e-06, "logits/chosen": -1.9450172185897827, "logits/rejected": -1.9627771377563477, "logps/chosen": -22.7064266204834, "logps/rejected": -53.26665115356445, "loss": 0.6723, "rewards/accuracies": 0.0, "rewards/chosen": 3.981046438217163, "rewards/margins": -0.05516839027404785, "rewards/rejected": 4.036214828491211, "step": 4929 }, { "epoch": 1.09, "learning_rate": 4.50612408365809e-06, "logits/chosen": -1.8556671142578125, "logits/rejected": -1.772601842880249, "logps/chosen": -65.4058609008789, "logps/rejected": -41.28120803833008, "loss": 0.1043, "rewards/accuracies": 1.0, "rewards/chosen": 2.1156165599823, "rewards/margins": 1.4680030345916748, "rewards/rejected": 0.647613525390625, "step": 4930 }, { "epoch": 1.09, "learning_rate": 4.50434055223868e-06, "logits/chosen": -2.0378403663635254, "logits/rejected": -2.016601324081421, "logps/chosen": -95.3466796875, "logps/rejected": -144.5589141845703, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 9.813082695007324, "rewards/margins": 4.260056972503662, "rewards/rejected": 5.553025722503662, "step": 4931 }, { "epoch": 1.09, "learning_rate": 4.502557084510321e-06, "logits/chosen": -2.0344300270080566, "logits/rejected": -2.015357494354248, "logps/chosen": -54.180938720703125, "logps/rejected": -44.08980941772461, "loss": 0.3112, "rewards/accuracies": 1.0, "rewards/chosen": 4.359350681304932, "rewards/margins": 2.045600652694702, "rewards/rejected": 2.3137500286102295, "step": 4932 }, { "epoch": 1.09, "learning_rate": 4.500773680702178e-06, "logits/chosen": -1.7124043703079224, "logits/rejected": -1.6552644968032837, "logps/chosen": -42.42730712890625, "logps/rejected": -62.162662506103516, "loss": 0.3105, "rewards/accuracies": 1.0, "rewards/chosen": 4.6589884757995605, "rewards/margins": 0.28775453567504883, "rewards/rejected": 4.371233940124512, "step": 4933 }, { "epoch": 1.09, "learning_rate": 4.498990341043419e-06, "logits/chosen": -1.9601455926895142, "logits/rejected": -1.9157061576843262, "logps/chosen": -30.692087173461914, "logps/rejected": -27.272083282470703, "loss": 0.1422, "rewards/accuracies": 1.0, "rewards/chosen": 2.26187801361084, "rewards/margins": 1.212845802307129, "rewards/rejected": 1.049032211303711, "step": 4934 }, { "epoch": 1.09, "learning_rate": 4.497207065763196e-06, "logits/chosen": -1.7148858308792114, "logits/rejected": -1.6963719129562378, "logps/chosen": -4.795953273773193, "logps/rejected": -2.5110177993774414, "loss": 2.6372, "rewards/accuracies": 1.0, "rewards/chosen": 1.8842054605484009, "rewards/margins": 0.4403904676437378, "rewards/rejected": 1.443814992904663, "step": 4935 }, { "epoch": 1.09, "learning_rate": 4.495423855090658e-06, "logits/chosen": -1.660606026649475, "logits/rejected": -1.6152091026306152, "logps/chosen": -82.13368225097656, "logps/rejected": -81.93812561035156, "loss": 0.0814, "rewards/accuracies": 1.0, "rewards/chosen": 4.338672161102295, "rewards/margins": 1.8340089321136475, "rewards/rejected": 2.5046632289886475, "step": 4936 }, { "epoch": 1.09, "learning_rate": 4.493640709254939e-06, "logits/chosen": -2.059920310974121, "logits/rejected": -2.0927326679229736, "logps/chosen": -66.20488739013672, "logps/rejected": -102.52023315429688, "loss": 0.4534, "rewards/accuracies": 0.0, "rewards/chosen": 6.505774974822998, "rewards/margins": -0.334775447845459, "rewards/rejected": 6.840550422668457, "step": 4937 }, { "epoch": 1.09, "learning_rate": 4.491857628485173e-06, "logits/chosen": -2.082760810852051, "logits/rejected": -2.068739891052246, "logps/chosen": -46.09940719604492, "logps/rejected": -34.060672760009766, "loss": 0.2632, "rewards/accuracies": 1.0, "rewards/chosen": 3.0109212398529053, "rewards/margins": 0.5745630264282227, "rewards/rejected": 2.4363582134246826, "step": 4938 }, { "epoch": 1.09, "learning_rate": 4.490074613010479e-06, "logits/chosen": -1.8432155847549438, "logits/rejected": -1.7207268476486206, "logps/chosen": -54.7930908203125, "logps/rejected": -20.367355346679688, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 3.5384292602539062, "rewards/margins": 2.420503616333008, "rewards/rejected": 1.1179255247116089, "step": 4939 }, { "epoch": 1.09, "learning_rate": 4.488291663059974e-06, "logits/chosen": -1.728873372077942, "logits/rejected": -1.728873372077942, "logps/chosen": -52.115638732910156, "logps/rejected": -52.115638732910156, "loss": 0.6108, "rewards/accuracies": 0.0, "rewards/chosen": 6.24585485458374, "rewards/margins": 0.0, "rewards/rejected": 6.24585485458374, "step": 4940 }, { "epoch": 1.09, "learning_rate": 4.486508778862759e-06, "logits/chosen": -1.8695605993270874, "logits/rejected": -1.7642707824707031, "logps/chosen": -41.19134521484375, "logps/rejected": -15.428638458251953, "loss": 0.197, "rewards/accuracies": 1.0, "rewards/chosen": 2.595378875732422, "rewards/margins": 2.0988616943359375, "rewards/rejected": 0.4965171813964844, "step": 4941 }, { "epoch": 1.09, "learning_rate": 4.48472596064793e-06, "logits/chosen": -1.834138035774231, "logits/rejected": -1.724882960319519, "logps/chosen": -46.663360595703125, "logps/rejected": -28.357072830200195, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": 2.3274734020233154, "rewards/margins": 2.122624635696411, "rewards/rejected": 0.20484887063503265, "step": 4942 }, { "epoch": 1.09, "learning_rate": 4.4829432086445784e-06, "logits/chosen": -1.7329225540161133, "logits/rejected": -1.807990550994873, "logps/chosen": -37.82571029663086, "logps/rejected": -53.355018615722656, "loss": 1.0304, "rewards/accuracies": 0.0, "rewards/chosen": 3.0681850910186768, "rewards/margins": -0.8198864459991455, "rewards/rejected": 3.8880715370178223, "step": 4943 }, { "epoch": 1.09, "learning_rate": 4.48116052308178e-06, "logits/chosen": -2.0737571716308594, "logits/rejected": -2.0737571716308594, "logps/chosen": -41.851356506347656, "logps/rejected": -41.851356506347656, "loss": 1.9317, "rewards/accuracies": 0.0, "rewards/chosen": 3.784027099609375, "rewards/margins": 0.0, "rewards/rejected": 3.784027099609375, "step": 4944 }, { "epoch": 1.09, "learning_rate": 4.479377904188609e-06, "logits/chosen": -2.0725691318511963, "logits/rejected": -2.0705409049987793, "logps/chosen": -82.56458282470703, "logps/rejected": -39.124725341796875, "loss": 0.323, "rewards/accuracies": 1.0, "rewards/chosen": 3.7452774047851562, "rewards/margins": 0.329193115234375, "rewards/rejected": 3.4160842895507812, "step": 4945 }, { "epoch": 1.09, "learning_rate": 4.477595352194123e-06, "logits/chosen": -2.2473866939544678, "logits/rejected": -2.227734088897705, "logps/chosen": -50.463966369628906, "logps/rejected": -49.75373458862305, "loss": 0.3124, "rewards/accuracies": 1.0, "rewards/chosen": 4.038655757904053, "rewards/margins": 1.7200860977172852, "rewards/rejected": 2.3185696601867676, "step": 4946 }, { "epoch": 1.09, "learning_rate": 4.475812867327381e-06, "logits/chosen": -2.3528544902801514, "logits/rejected": -2.3218274116516113, "logps/chosen": -30.957216262817383, "logps/rejected": -183.91648864746094, "loss": 2.1463, "rewards/accuracies": 0.0, "rewards/chosen": 4.314971923828125, "rewards/margins": -2.3309264183044434, "rewards/rejected": 6.645898342132568, "step": 4947 }, { "epoch": 1.1, "learning_rate": 4.474030449817423e-06, "logits/chosen": -1.7940863370895386, "logits/rejected": -1.7940863370895386, "logps/chosen": -5.595643997192383, "logps/rejected": -5.595643997192383, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 1.89627206325531, "rewards/margins": 0.0, "rewards/rejected": 1.89627206325531, "step": 4948 }, { "epoch": 1.1, "learning_rate": 4.47224809989329e-06, "logits/chosen": -1.6519304513931274, "logits/rejected": -1.6068048477172852, "logps/chosen": -46.078712463378906, "logps/rejected": -28.143169403076172, "loss": 0.5888, "rewards/accuracies": 0.0, "rewards/chosen": 1.9981101751327515, "rewards/margins": -0.8054779767990112, "rewards/rejected": 2.8035881519317627, "step": 4949 }, { "epoch": 1.1, "learning_rate": 4.4704658177840076e-06, "logits/chosen": -1.9139854907989502, "logits/rejected": -1.883477807044983, "logps/chosen": -30.018478393554688, "logps/rejected": -55.110633850097656, "loss": 0.43, "rewards/accuracies": 0.0, "rewards/chosen": 2.8351941108703613, "rewards/margins": -0.006113767623901367, "rewards/rejected": 2.8413078784942627, "step": 4950 }, { "epoch": 1.1, "learning_rate": 4.468683603718592e-06, "logits/chosen": -1.9099901914596558, "logits/rejected": -1.9405615329742432, "logps/chosen": -60.31306076049805, "logps/rejected": -38.733985900878906, "loss": 0.5924, "rewards/accuracies": 1.0, "rewards/chosen": 3.959442615509033, "rewards/margins": 2.1492152214050293, "rewards/rejected": 1.8102272748947144, "step": 4951 }, { "epoch": 1.1, "learning_rate": 4.4669014579260595e-06, "logits/chosen": -1.6327133178710938, "logits/rejected": -1.6160472631454468, "logps/chosen": -45.83765411376953, "logps/rejected": -96.92021179199219, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": 3.374314069747925, "rewards/margins": 2.0790281295776367, "rewards/rejected": 1.2952858209609985, "step": 4952 }, { "epoch": 1.1, "learning_rate": 4.465119380635406e-06, "logits/chosen": -2.000887155532837, "logits/rejected": -1.9832338094711304, "logps/chosen": -22.76071548461914, "logps/rejected": -41.13764190673828, "loss": 1.1103, "rewards/accuracies": 0.0, "rewards/chosen": 3.507277011871338, "rewards/margins": -1.7658562660217285, "rewards/rejected": 5.273133277893066, "step": 4953 }, { "epoch": 1.1, "learning_rate": 4.463337372075627e-06, "logits/chosen": -1.8978487253189087, "logits/rejected": -1.7453703880310059, "logps/chosen": -64.54885864257812, "logps/rejected": -23.998933792114258, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 5.351953983306885, "rewards/margins": 4.469293117523193, "rewards/rejected": 0.8826608657836914, "step": 4954 }, { "epoch": 1.1, "learning_rate": 4.461555432475705e-06, "logits/chosen": -2.1494030952453613, "logits/rejected": -2.0956363677978516, "logps/chosen": -147.3651123046875, "logps/rejected": -34.56553649902344, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 7.886807441711426, "rewards/margins": 4.535315036773682, "rewards/rejected": 3.351492404937744, "step": 4955 }, { "epoch": 1.1, "learning_rate": 4.4597735620646156e-06, "logits/chosen": -1.5919393301010132, "logits/rejected": -1.6127231121063232, "logps/chosen": -23.413379669189453, "logps/rejected": -45.853782653808594, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 2.2771003246307373, "rewards/margins": 0.09817337989807129, "rewards/rejected": 2.178926944732666, "step": 4956 }, { "epoch": 1.1, "learning_rate": 4.457991761071324e-06, "logits/chosen": -2.076627731323242, "logits/rejected": -1.9624338150024414, "logps/chosen": -108.2696762084961, "logps/rejected": -41.342838287353516, "loss": 0.3892, "rewards/accuracies": 0.0, "rewards/chosen": 4.248502254486084, "rewards/margins": -0.13314199447631836, "rewards/rejected": 4.381644248962402, "step": 4957 }, { "epoch": 1.1, "learning_rate": 4.4562100297247905e-06, "logits/chosen": -1.543433666229248, "logits/rejected": -1.4829703569412231, "logps/chosen": -31.407299041748047, "logps/rejected": -70.20497131347656, "loss": 0.287, "rewards/accuracies": 1.0, "rewards/chosen": 4.630412578582764, "rewards/margins": 0.39354944229125977, "rewards/rejected": 4.236863136291504, "step": 4958 }, { "epoch": 1.1, "learning_rate": 4.454428368253959e-06, "logits/chosen": -2.177243709564209, "logits/rejected": -2.152003288269043, "logps/chosen": -110.68460083007812, "logps/rejected": -83.97386169433594, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 6.080650329589844, "rewards/margins": 3.834958553314209, "rewards/rejected": 2.2456917762756348, "step": 4959 }, { "epoch": 1.1, "learning_rate": 4.452646776887772e-06, "logits/chosen": -1.7082902193069458, "logits/rejected": -1.7082902193069458, "logps/chosen": -7.550636291503906, "logps/rejected": -7.550636291503906, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.8962373733520508, "rewards/margins": 0.0, "rewards/rejected": 0.8962373733520508, "step": 4960 }, { "epoch": 1.1, "learning_rate": 4.4508652558551576e-06, "logits/chosen": -1.9178327322006226, "logits/rejected": -1.8032116889953613, "logps/chosen": -112.12911987304688, "logps/rejected": -43.06302261352539, "loss": 0.1437, "rewards/accuracies": 1.0, "rewards/chosen": 5.529786586761475, "rewards/margins": 3.5518388748168945, "rewards/rejected": 1.9779475927352905, "step": 4961 }, { "epoch": 1.1, "learning_rate": 4.449083805385037e-06, "logits/chosen": -2.1088759899139404, "logits/rejected": -2.1088759899139404, "logps/chosen": -46.37542724609375, "logps/rejected": -46.37542724609375, "loss": 0.3722, "rewards/accuracies": 0.0, "rewards/chosen": 7.348000526428223, "rewards/margins": 0.0, "rewards/rejected": 7.348000526428223, "step": 4962 }, { "epoch": 1.1, "learning_rate": 4.447302425706326e-06, "logits/chosen": -1.8102625608444214, "logits/rejected": -1.8135327100753784, "logps/chosen": -50.68534851074219, "logps/rejected": -37.32677459716797, "loss": 1.4216, "rewards/accuracies": 0.0, "rewards/chosen": 3.5511093139648438, "rewards/margins": -2.65632963180542, "rewards/rejected": 6.207438945770264, "step": 4963 }, { "epoch": 1.1, "learning_rate": 4.44552111704792e-06, "logits/chosen": -1.6821553707122803, "logits/rejected": -1.6385995149612427, "logps/chosen": -64.73420715332031, "logps/rejected": -50.471839904785156, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": 3.6180222034454346, "rewards/margins": 1.1665160655975342, "rewards/rejected": 2.4515061378479004, "step": 4964 }, { "epoch": 1.1, "learning_rate": 4.443739879638722e-06, "logits/chosen": -2.0430634021759033, "logits/rejected": -2.0401830673217773, "logps/chosen": -27.117115020751953, "logps/rejected": -83.54005432128906, "loss": 0.4805, "rewards/accuracies": 0.0, "rewards/chosen": 2.189624071121216, "rewards/margins": -0.4782531261444092, "rewards/rejected": 2.667877197265625, "step": 4965 }, { "epoch": 1.1, "learning_rate": 4.441958713707608e-06, "logits/chosen": -1.8127753734588623, "logits/rejected": -1.8709943294525146, "logps/chosen": -91.43878173828125, "logps/rejected": -115.07038879394531, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 7.895613193511963, "rewards/margins": 1.9832887649536133, "rewards/rejected": 5.91232442855835, "step": 4966 }, { "epoch": 1.1, "learning_rate": 4.4401776194834615e-06, "logits/chosen": -1.9752535820007324, "logits/rejected": -1.9519284963607788, "logps/chosen": -45.40290832519531, "logps/rejected": -50.629058837890625, "loss": 0.2154, "rewards/accuracies": 1.0, "rewards/chosen": 3.2405083179473877, "rewards/margins": 0.9893944263458252, "rewards/rejected": 2.2511138916015625, "step": 4967 }, { "epoch": 1.1, "learning_rate": 4.438396597195143e-06, "logits/chosen": -1.7442349195480347, "logits/rejected": -1.7794140577316284, "logps/chosen": -30.105636596679688, "logps/rejected": -57.68693542480469, "loss": 1.1487, "rewards/accuracies": 0.0, "rewards/chosen": 4.997973918914795, "rewards/margins": -2.141697406768799, "rewards/rejected": 7.139671325683594, "step": 4968 }, { "epoch": 1.1, "learning_rate": 4.436615647071514e-06, "logits/chosen": -1.8407319784164429, "logits/rejected": -1.7478182315826416, "logps/chosen": -35.211891174316406, "logps/rejected": -25.019550323486328, "loss": 0.1702, "rewards/accuracies": 1.0, "rewards/chosen": 3.84721302986145, "rewards/margins": 0.9594845771789551, "rewards/rejected": 2.887728452682495, "step": 4969 }, { "epoch": 1.1, "learning_rate": 4.4348347693414175e-06, "logits/chosen": -1.898137092590332, "logits/rejected": -1.904350757598877, "logps/chosen": -34.496490478515625, "logps/rejected": -56.79319763183594, "loss": 0.7219, "rewards/accuracies": 0.0, "rewards/chosen": 3.8136565685272217, "rewards/margins": -1.134521722793579, "rewards/rejected": 4.948178291320801, "step": 4970 }, { "epoch": 1.1, "learning_rate": 4.433053964233696e-06, "logits/chosen": -1.8087217807769775, "logits/rejected": -1.8109512329101562, "logps/chosen": -24.078142166137695, "logps/rejected": -30.933425903320312, "loss": 0.4209, "rewards/accuracies": 0.0, "rewards/chosen": 1.4992605447769165, "rewards/margins": -0.043985962867736816, "rewards/rejected": 1.5432465076446533, "step": 4971 }, { "epoch": 1.1, "learning_rate": 4.431273231977176e-06, "logits/chosen": -2.0982697010040283, "logits/rejected": -2.021521806716919, "logps/chosen": -68.97198486328125, "logps/rejected": -58.567142486572266, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 2.6242311000823975, "rewards/margins": 1.8236066102981567, "rewards/rejected": 0.8006244897842407, "step": 4972 }, { "epoch": 1.1, "learning_rate": 4.4294925728006785e-06, "logits/chosen": -2.1333577632904053, "logits/rejected": -1.9393422603607178, "logps/chosen": -133.9739532470703, "logps/rejected": -32.358970642089844, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 6.768516540527344, "rewards/margins": 4.9232378005981445, "rewards/rejected": 1.8452786207199097, "step": 4973 }, { "epoch": 1.1, "learning_rate": 4.427711986933016e-06, "logits/chosen": -1.7555409669876099, "logits/rejected": -1.7548997402191162, "logps/chosen": -47.872962951660156, "logps/rejected": -81.21456909179688, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": 4.875100135803223, "rewards/margins": 1.800485372543335, "rewards/rejected": 3.0746147632598877, "step": 4974 }, { "epoch": 1.1, "learning_rate": 4.425931474602985e-06, "logits/chosen": -1.7335340976715088, "logits/rejected": -1.704819917678833, "logps/chosen": -45.206878662109375, "logps/rejected": -17.417165756225586, "loss": 0.2994, "rewards/accuracies": 1.0, "rewards/chosen": 2.167487382888794, "rewards/margins": 1.3626430034637451, "rewards/rejected": 0.804844319820404, "step": 4975 }, { "epoch": 1.1, "learning_rate": 4.424151036039381e-06, "logits/chosen": -1.595334768295288, "logits/rejected": -1.595334768295288, "logps/chosen": -75.84461212158203, "logps/rejected": -75.84461212158203, "loss": 0.3629, "rewards/accuracies": 0.0, "rewards/chosen": 3.05830454826355, "rewards/margins": 0.0, "rewards/rejected": 3.05830454826355, "step": 4976 }, { "epoch": 1.1, "learning_rate": 4.422370671470984e-06, "logits/chosen": -1.504132628440857, "logits/rejected": -1.4944807291030884, "logps/chosen": -57.035186767578125, "logps/rejected": -47.330230712890625, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 5.53481912612915, "rewards/margins": 1.135847568511963, "rewards/rejected": 4.3989715576171875, "step": 4977 }, { "epoch": 1.1, "learning_rate": 4.420590381126567e-06, "logits/chosen": -2.219517469406128, "logits/rejected": -2.243088960647583, "logps/chosen": -100.88522338867188, "logps/rejected": -61.52867889404297, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": 5.757462978363037, "rewards/margins": 1.1017675399780273, "rewards/rejected": 4.65569543838501, "step": 4978 }, { "epoch": 1.1, "learning_rate": 4.418810165234893e-06, "logits/chosen": -1.7166399955749512, "logits/rejected": -1.7166399955749512, "logps/chosen": -34.23237991333008, "logps/rejected": -34.23237991333008, "loss": 0.3485, "rewards/accuracies": 0.0, "rewards/chosen": 2.571428060531616, "rewards/margins": 0.0, "rewards/rejected": 2.571428060531616, "step": 4979 }, { "epoch": 1.1, "learning_rate": 4.417030024024716e-06, "logits/chosen": -2.0540566444396973, "logits/rejected": -2.003122568130493, "logps/chosen": -75.48836517333984, "logps/rejected": -63.93355941772461, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 3.8670341968536377, "rewards/margins": 2.7815518379211426, "rewards/rejected": 1.0854824781417847, "step": 4980 }, { "epoch": 1.1, "learning_rate": 4.415249957724781e-06, "logits/chosen": -1.9513814449310303, "logits/rejected": -1.8393315076828003, "logps/chosen": -90.31396484375, "logps/rejected": -54.909523010253906, "loss": 0.0903, "rewards/accuracies": 1.0, "rewards/chosen": 5.366000652313232, "rewards/margins": 2.515662670135498, "rewards/rejected": 2.8503379821777344, "step": 4981 }, { "epoch": 1.1, "learning_rate": 4.413469966563817e-06, "logits/chosen": -1.8971959352493286, "logits/rejected": -1.7849245071411133, "logps/chosen": -62.482765197753906, "logps/rejected": -72.69091796875, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": 4.97066593170166, "rewards/margins": 2.184199571609497, "rewards/rejected": 2.786466360092163, "step": 4982 }, { "epoch": 1.1, "learning_rate": 4.411690050770557e-06, "logits/chosen": -1.9343715906143188, "logits/rejected": -1.9812004566192627, "logps/chosen": -32.48170852661133, "logps/rejected": -94.00868225097656, "loss": 0.2953, "rewards/accuracies": 1.0, "rewards/chosen": 6.561336040496826, "rewards/margins": 0.9182953834533691, "rewards/rejected": 5.643040657043457, "step": 4983 }, { "epoch": 1.1, "learning_rate": 4.409910210573707e-06, "logits/chosen": -2.0030605792999268, "logits/rejected": -1.9991894960403442, "logps/chosen": -62.498687744140625, "logps/rejected": -114.9793701171875, "loss": 0.1218, "rewards/accuracies": 1.0, "rewards/chosen": 9.460161209106445, "rewards/margins": 1.9379901885986328, "rewards/rejected": 7.5221710205078125, "step": 4984 }, { "epoch": 1.1, "learning_rate": 4.408130446201978e-06, "logits/chosen": -1.7156049013137817, "logits/rejected": -1.6298426389694214, "logps/chosen": -34.669273376464844, "logps/rejected": -29.714468002319336, "loss": 0.3685, "rewards/accuracies": 1.0, "rewards/chosen": 3.926027774810791, "rewards/margins": 2.031780481338501, "rewards/rejected": 1.89424729347229, "step": 4985 }, { "epoch": 1.1, "learning_rate": 4.406350757884064e-06, "logits/chosen": -2.2124204635620117, "logits/rejected": -2.2236790657043457, "logps/chosen": -31.129249572753906, "logps/rejected": -72.01380920410156, "loss": 1.4688, "rewards/accuracies": 0.0, "rewards/chosen": 3.8229806423187256, "rewards/margins": -2.4575798511505127, "rewards/rejected": 6.280560493469238, "step": 4986 }, { "epoch": 1.1, "learning_rate": 4.404571145848652e-06, "logits/chosen": -1.7980986833572388, "logits/rejected": -1.7797151803970337, "logps/chosen": -47.23677062988281, "logps/rejected": -78.95199584960938, "loss": 0.2038, "rewards/accuracies": 1.0, "rewards/chosen": 5.262730598449707, "rewards/margins": 1.0338358879089355, "rewards/rejected": 4.2288947105407715, "step": 4987 }, { "epoch": 1.1, "learning_rate": 4.402791610324413e-06, "logits/chosen": -1.9670745134353638, "logits/rejected": -1.8896002769470215, "logps/chosen": -138.83338928222656, "logps/rejected": -70.80339050292969, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 8.114824295043945, "rewards/margins": 3.0506129264831543, "rewards/rejected": 5.064211368560791, "step": 4988 }, { "epoch": 1.1, "learning_rate": 4.4010121515400195e-06, "logits/chosen": -1.7657582759857178, "logits/rejected": -1.6357049942016602, "logps/chosen": -103.72796630859375, "logps/rejected": -46.068824768066406, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 6.323996067047119, "rewards/margins": 4.21071720123291, "rewards/rejected": 2.113279104232788, "step": 4989 }, { "epoch": 1.1, "learning_rate": 4.3992327697241225e-06, "logits/chosen": -1.7037417888641357, "logits/rejected": -1.7135546207427979, "logps/chosen": -24.484363555908203, "logps/rejected": -58.94412612915039, "loss": 0.4722, "rewards/accuracies": 1.0, "rewards/chosen": 2.3018200397491455, "rewards/margins": 0.06838846206665039, "rewards/rejected": 2.233431577682495, "step": 4990 }, { "epoch": 1.1, "learning_rate": 4.397453465105372e-06, "logits/chosen": -1.895116925239563, "logits/rejected": -1.8513028621673584, "logps/chosen": -111.72665405273438, "logps/rejected": -60.942901611328125, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 6.6347198486328125, "rewards/margins": 2.085422992706299, "rewards/rejected": 4.549296855926514, "step": 4991 }, { "epoch": 1.1, "learning_rate": 4.395674237912403e-06, "logits/chosen": -1.7892510890960693, "logits/rejected": -1.8654636144638062, "logps/chosen": -61.41964340209961, "logps/rejected": -37.899173736572266, "loss": 0.3394, "rewards/accuracies": 1.0, "rewards/chosen": 4.704666614532471, "rewards/margins": 0.07016515731811523, "rewards/rejected": 4.6345014572143555, "step": 4992 }, { "epoch": 1.11, "learning_rate": 4.393895088373839e-06, "logits/chosen": -1.9815163612365723, "logits/rejected": -1.9815163612365723, "logps/chosen": -45.42774200439453, "logps/rejected": -45.42774200439453, "loss": 0.3496, "rewards/accuracies": 0.0, "rewards/chosen": 3.1448211669921875, "rewards/margins": 0.0, "rewards/rejected": 3.1448211669921875, "step": 4993 }, { "epoch": 1.11, "learning_rate": 4.392116016718302e-06, "logits/chosen": -1.8736152648925781, "logits/rejected": -1.8203284740447998, "logps/chosen": -27.832408905029297, "logps/rejected": -30.567996978759766, "loss": 0.4401, "rewards/accuracies": 0.0, "rewards/chosen": 2.764008045196533, "rewards/margins": -0.23152685165405273, "rewards/rejected": 2.995534896850586, "step": 4994 }, { "epoch": 1.11, "learning_rate": 4.390337023174394e-06, "logits/chosen": -1.7880960702896118, "logits/rejected": -1.795361042022705, "logps/chosen": -69.61888122558594, "logps/rejected": -70.58273315429688, "loss": 0.2302, "rewards/accuracies": 1.0, "rewards/chosen": 5.247447490692139, "rewards/margins": 0.7868804931640625, "rewards/rejected": 4.460566997528076, "step": 4995 }, { "epoch": 1.11, "learning_rate": 4.388558107970714e-06, "logits/chosen": -1.968510389328003, "logits/rejected": -1.983698844909668, "logps/chosen": -54.92220687866211, "logps/rejected": -74.32186889648438, "loss": 0.6195, "rewards/accuracies": 0.0, "rewards/chosen": 3.825502395629883, "rewards/margins": -0.664668083190918, "rewards/rejected": 4.490170478820801, "step": 4996 }, { "epoch": 1.11, "learning_rate": 4.386779271335845e-06, "logits/chosen": -1.7923861742019653, "logits/rejected": -1.7007447481155396, "logps/chosen": -23.25900650024414, "logps/rejected": -27.311992645263672, "loss": 1.2593, "rewards/accuracies": 0.0, "rewards/chosen": 1.7096179723739624, "rewards/margins": -1.6480134725570679, "rewards/rejected": 3.3576314449310303, "step": 4997 }, { "epoch": 1.11, "learning_rate": 4.385000513498368e-06, "logits/chosen": -1.776811957359314, "logits/rejected": -1.661342978477478, "logps/chosen": -38.111328125, "logps/rejected": -26.684446334838867, "loss": 0.3344, "rewards/accuracies": 1.0, "rewards/chosen": 4.237425327301025, "rewards/margins": 4.006383419036865, "rewards/rejected": 0.23104190826416016, "step": 4998 }, { "epoch": 1.11, "learning_rate": 4.383221834686845e-06, "logits/chosen": -1.7163342237472534, "logits/rejected": -1.7154920101165771, "logps/chosen": -64.83447265625, "logps/rejected": -67.06060791015625, "loss": 0.1792, "rewards/accuracies": 1.0, "rewards/chosen": 4.319858074188232, "rewards/margins": 1.1180682182312012, "rewards/rejected": 3.2017898559570312, "step": 4999 }, { "epoch": 1.11, "learning_rate": 4.381443235129834e-06, "logits/chosen": -1.7156500816345215, "logits/rejected": -1.6132920980453491, "logps/chosen": -55.30375671386719, "logps/rejected": -23.98378562927246, "loss": 0.1638, "rewards/accuracies": 1.0, "rewards/chosen": 2.071279287338257, "rewards/margins": 1.060410976409912, "rewards/rejected": 1.0108683109283447, "step": 5000 }, { "epoch": 1.11, "learning_rate": 4.379664715055883e-06, "logits/chosen": -1.8557393550872803, "logits/rejected": -1.8557393550872803, "logps/chosen": -29.561901092529297, "logps/rejected": -29.561901092529297, "loss": 1.2527, "rewards/accuracies": 0.0, "rewards/chosen": 2.790781021118164, "rewards/margins": 0.0, "rewards/rejected": 2.790781021118164, "step": 5001 }, { "epoch": 1.11, "learning_rate": 4.37788627469352e-06, "logits/chosen": -1.9478511810302734, "logits/rejected": -1.9210424423217773, "logps/chosen": -86.06010437011719, "logps/rejected": -106.16969299316406, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 9.647822380065918, "rewards/margins": 3.5635037422180176, "rewards/rejected": 6.0843186378479, "step": 5002 }, { "epoch": 1.11, "learning_rate": 4.3761079142712784e-06, "logits/chosen": -1.8013474941253662, "logits/rejected": -1.73219895362854, "logps/chosen": -201.27459716796875, "logps/rejected": -73.59323120117188, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 9.357733726501465, "rewards/margins": 2.766054153442383, "rewards/rejected": 6.591679573059082, "step": 5003 }, { "epoch": 1.11, "learning_rate": 4.3743296340176694e-06, "logits/chosen": -1.6903578042984009, "logits/rejected": -1.6903578042984009, "logps/chosen": -28.815065383911133, "logps/rejected": -28.815065383911133, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 2.175079584121704, "rewards/margins": 0.0, "rewards/rejected": 2.175079584121704, "step": 5004 }, { "epoch": 1.11, "learning_rate": 4.372551434161199e-06, "logits/chosen": -1.9950498342514038, "logits/rejected": -1.9410663843154907, "logps/chosen": -63.19926834106445, "logps/rejected": -58.25157165527344, "loss": 0.1329, "rewards/accuracies": 1.0, "rewards/chosen": 7.7328386306762695, "rewards/margins": 2.426555633544922, "rewards/rejected": 5.306282997131348, "step": 5005 }, { "epoch": 1.11, "learning_rate": 4.370773314930359e-06, "logits/chosen": -2.08892560005188, "logits/rejected": -2.089888572692871, "logps/chosen": -30.813140869140625, "logps/rejected": -53.07838821411133, "loss": 0.4301, "rewards/accuracies": 0.0, "rewards/chosen": 4.29128360748291, "rewards/margins": -0.09997367858886719, "rewards/rejected": 4.391257286071777, "step": 5006 }, { "epoch": 1.11, "learning_rate": 4.368995276553637e-06, "logits/chosen": -1.8448954820632935, "logits/rejected": -1.8675867319107056, "logps/chosen": -30.69571876525879, "logps/rejected": -49.94385528564453, "loss": 0.4472, "rewards/accuracies": 0.0, "rewards/chosen": 3.4347786903381348, "rewards/margins": -0.20979905128479004, "rewards/rejected": 3.644577741622925, "step": 5007 }, { "epoch": 1.11, "learning_rate": 4.367217319259504e-06, "logits/chosen": -1.9947634935379028, "logits/rejected": -1.9618241786956787, "logps/chosen": -32.7236213684082, "logps/rejected": -18.108104705810547, "loss": 0.2827, "rewards/accuracies": 1.0, "rewards/chosen": 3.0465145111083984, "rewards/margins": 0.2802422046661377, "rewards/rejected": 2.7662723064422607, "step": 5008 }, { "epoch": 1.11, "learning_rate": 4.365439443276426e-06, "logits/chosen": -1.8863962888717651, "logits/rejected": -1.8665707111358643, "logps/chosen": -18.701618194580078, "logps/rejected": -59.244606018066406, "loss": 0.6102, "rewards/accuracies": 1.0, "rewards/chosen": 1.9196045398712158, "rewards/margins": 0.7631500959396362, "rewards/rejected": 1.1564544439315796, "step": 5009 }, { "epoch": 1.11, "learning_rate": 4.363661648832852e-06, "logits/chosen": -1.800709843635559, "logits/rejected": -1.3844783306121826, "logps/chosen": -87.77619934082031, "logps/rejected": -62.88838195800781, "loss": 0.4879, "rewards/accuracies": 0.0, "rewards/chosen": 5.536080837249756, "rewards/margins": -0.24727344512939453, "rewards/rejected": 5.78335428237915, "step": 5010 }, { "epoch": 1.11, "learning_rate": 4.361883936157229e-06, "logits/chosen": -1.8270947933197021, "logits/rejected": -1.8744301795959473, "logps/chosen": -65.08181762695312, "logps/rejected": -108.45889282226562, "loss": 0.3602, "rewards/accuracies": 1.0, "rewards/chosen": 3.5155410766601562, "rewards/margins": 0.24584197998046875, "rewards/rejected": 3.2696990966796875, "step": 5011 }, { "epoch": 1.11, "learning_rate": 4.3601063054779835e-06, "logits/chosen": -2.2301855087280273, "logits/rejected": -2.2205731868743896, "logps/chosen": -49.786109924316406, "logps/rejected": -65.28547668457031, "loss": 0.7932, "rewards/accuracies": 1.0, "rewards/chosen": 4.146962642669678, "rewards/margins": 1.164916753768921, "rewards/rejected": 2.982045888900757, "step": 5012 }, { "epoch": 1.11, "learning_rate": 4.35832875702354e-06, "logits/chosen": -2.0454397201538086, "logits/rejected": -1.9741145372390747, "logps/chosen": -96.16636657714844, "logps/rejected": -109.76655578613281, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 8.792576789855957, "rewards/margins": 3.3731322288513184, "rewards/rejected": 5.419444561004639, "step": 5013 }, { "epoch": 1.11, "learning_rate": 4.35655129102231e-06, "logits/chosen": -1.9907361268997192, "logits/rejected": -1.9864295721054077, "logps/chosen": -45.58736038208008, "logps/rejected": -83.8970947265625, "loss": 1.2181, "rewards/accuracies": 0.0, "rewards/chosen": 4.167638778686523, "rewards/margins": -2.3413028717041016, "rewards/rejected": 6.508941650390625, "step": 5014 }, { "epoch": 1.11, "learning_rate": 4.354773907702691e-06, "logits/chosen": -1.7521836757659912, "logits/rejected": -1.7231091260910034, "logps/chosen": -81.08305358886719, "logps/rejected": -86.58070373535156, "loss": 0.945, "rewards/accuracies": 0.0, "rewards/chosen": 7.101219177246094, "rewards/margins": -1.533416748046875, "rewards/rejected": 8.634635925292969, "step": 5015 }, { "epoch": 1.11, "learning_rate": 4.352996607293075e-06, "logits/chosen": -2.0442919731140137, "logits/rejected": -2.0401933193206787, "logps/chosen": -21.751253128051758, "logps/rejected": -94.85250854492188, "loss": 2.7262, "rewards/accuracies": 0.0, "rewards/chosen": 3.3005006313323975, "rewards/margins": -5.322464942932129, "rewards/rejected": 8.622965812683105, "step": 5016 }, { "epoch": 1.11, "learning_rate": 4.351219390021838e-06, "logits/chosen": -2.074232816696167, "logits/rejected": -2.0718657970428467, "logps/chosen": -89.33255004882812, "logps/rejected": -148.65574645996094, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 6.9562578201293945, "rewards/margins": 1.3689513206481934, "rewards/rejected": 5.587306499481201, "step": 5017 }, { "epoch": 1.11, "learning_rate": 4.3494422561173515e-06, "logits/chosen": -1.3351521492004395, "logits/rejected": -1.3192620277404785, "logps/chosen": -5.181523323059082, "logps/rejected": -7.928874969482422, "loss": 0.6341, "rewards/accuracies": 0.0, "rewards/chosen": 0.6912358403205872, "rewards/margins": -0.460773766040802, "rewards/rejected": 1.1520096063613892, "step": 5018 }, { "epoch": 1.11, "learning_rate": 4.34766520580797e-06, "logits/chosen": -2.41501784324646, "logits/rejected": -2.351003408432007, "logps/chosen": -111.2590560913086, "logps/rejected": -82.92121887207031, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 8.912575721740723, "rewards/margins": 4.3315300941467285, "rewards/rejected": 4.581045627593994, "step": 5019 }, { "epoch": 1.11, "learning_rate": 4.345888239322042e-06, "logits/chosen": -2.1054255962371826, "logits/rejected": -2.04931640625, "logps/chosen": -86.58479309082031, "logps/rejected": -20.412811279296875, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": 8.645207405090332, "rewards/margins": 7.306926250457764, "rewards/rejected": 1.338281273841858, "step": 5020 }, { "epoch": 1.11, "learning_rate": 4.3441113568879046e-06, "logits/chosen": -1.9745445251464844, "logits/rejected": -2.067467212677002, "logps/chosen": -43.70903015136719, "logps/rejected": -164.51068115234375, "loss": 1.2365, "rewards/accuracies": 0.0, "rewards/chosen": 7.759660243988037, "rewards/margins": -2.372025966644287, "rewards/rejected": 10.131686210632324, "step": 5021 }, { "epoch": 1.11, "learning_rate": 4.342334558733878e-06, "logits/chosen": -2.2717816829681396, "logits/rejected": -2.21036696434021, "logps/chosen": -32.7718505859375, "logps/rejected": -17.15804100036621, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 4.103697299957275, "rewards/margins": 3.0820116996765137, "rewards/rejected": 1.0216856002807617, "step": 5022 }, { "epoch": 1.11, "learning_rate": 4.340557845088281e-06, "logits/chosen": -1.8023275136947632, "logits/rejected": -1.6872848272323608, "logps/chosen": -123.29933166503906, "logps/rejected": -79.72065734863281, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": 8.504918098449707, "rewards/margins": 3.360400676727295, "rewards/rejected": 5.144517421722412, "step": 5023 }, { "epoch": 1.11, "learning_rate": 4.338781216179414e-06, "logits/chosen": -1.7347978353500366, "logits/rejected": -1.7347978353500366, "logps/chosen": -26.934335708618164, "logps/rejected": -26.934335708618164, "loss": 0.3471, "rewards/accuracies": 0.0, "rewards/chosen": 2.8261046409606934, "rewards/margins": 0.0, "rewards/rejected": 2.8261046409606934, "step": 5024 }, { "epoch": 1.11, "learning_rate": 4.337004672235574e-06, "logits/chosen": -2.0783345699310303, "logits/rejected": -2.0173070430755615, "logps/chosen": -122.90325927734375, "logps/rejected": -62.074466705322266, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 7.084054470062256, "rewards/margins": 1.4359230995178223, "rewards/rejected": 5.648131370544434, "step": 5025 }, { "epoch": 1.11, "learning_rate": 4.3352282134850375e-06, "logits/chosen": -1.7717278003692627, "logits/rejected": -1.7035218477249146, "logps/chosen": -33.69111251831055, "logps/rejected": -55.35771942138672, "loss": 0.4194, "rewards/accuracies": 1.0, "rewards/chosen": 3.3824315071105957, "rewards/margins": 0.31859707832336426, "rewards/rejected": 3.0638344287872314, "step": 5026 }, { "epoch": 1.11, "learning_rate": 4.333451840156078e-06, "logits/chosen": -2.26234769821167, "logits/rejected": -2.2625367641448975, "logps/chosen": -25.320594787597656, "logps/rejected": -114.83364868164062, "loss": 0.2337, "rewards/accuracies": 1.0, "rewards/chosen": 3.7579026222229004, "rewards/margins": 1.550755262374878, "rewards/rejected": 2.2071473598480225, "step": 5027 }, { "epoch": 1.11, "learning_rate": 4.3316755524769556e-06, "logits/chosen": -1.940307378768921, "logits/rejected": -1.847084403038025, "logps/chosen": -58.7459716796875, "logps/rejected": -30.026395797729492, "loss": 0.1566, "rewards/accuracies": 1.0, "rewards/chosen": 4.395303249359131, "rewards/margins": 1.0237576961517334, "rewards/rejected": 3.3715455532073975, "step": 5028 }, { "epoch": 1.11, "learning_rate": 4.329899350675918e-06, "logits/chosen": -1.6672204732894897, "logits/rejected": -1.6672204732894897, "logps/chosen": -48.099395751953125, "logps/rejected": -48.099395751953125, "loss": 0.3584, "rewards/accuracies": 0.0, "rewards/chosen": 3.478476047515869, "rewards/margins": 0.0, "rewards/rejected": 3.478476047515869, "step": 5029 }, { "epoch": 1.11, "learning_rate": 4.328123234981202e-06, "logits/chosen": -1.8728843927383423, "logits/rejected": -1.8346965312957764, "logps/chosen": -53.89387130737305, "logps/rejected": -108.79203796386719, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 3.3469836711883545, "rewards/margins": 1.7604182958602905, "rewards/rejected": 1.586565375328064, "step": 5030 }, { "epoch": 1.11, "learning_rate": 4.3263472056210365e-06, "logits/chosen": -2.053999423980713, "logits/rejected": -2.0349385738372803, "logps/chosen": -34.415977478027344, "logps/rejected": -77.19257354736328, "loss": 0.7203, "rewards/accuracies": 0.0, "rewards/chosen": 2.8190715312957764, "rewards/margins": -1.1063108444213867, "rewards/rejected": 3.925382375717163, "step": 5031 }, { "epoch": 1.11, "learning_rate": 4.3245712628236356e-06, "logits/chosen": -2.0153489112854004, "logits/rejected": -2.0342228412628174, "logps/chosen": -50.07414627075195, "logps/rejected": -93.20832824707031, "loss": 0.2679, "rewards/accuracies": 1.0, "rewards/chosen": 3.997755765914917, "rewards/margins": 2.26466703414917, "rewards/rejected": 1.733088731765747, "step": 5032 }, { "epoch": 1.11, "learning_rate": 4.322795406817202e-06, "logits/chosen": -1.5255153179168701, "logits/rejected": -1.5527839660644531, "logps/chosen": -40.00689697265625, "logps/rejected": -42.82080078125, "loss": 0.2036, "rewards/accuracies": 1.0, "rewards/chosen": 3.2766494750976562, "rewards/margins": 1.1076292991638184, "rewards/rejected": 2.169020175933838, "step": 5033 }, { "epoch": 1.11, "learning_rate": 4.321019637829932e-06, "logits/chosen": -1.882099986076355, "logits/rejected": -1.7445011138916016, "logps/chosen": -49.39149475097656, "logps/rejected": -22.60430335998535, "loss": 1.1646, "rewards/accuracies": 1.0, "rewards/chosen": 2.933575391769409, "rewards/margins": 1.6516839265823364, "rewards/rejected": 1.2818914651870728, "step": 5034 }, { "epoch": 1.11, "learning_rate": 4.319243956090005e-06, "logits/chosen": -1.788902759552002, "logits/rejected": -1.7442286014556885, "logps/chosen": -145.8226318359375, "logps/rejected": -94.51691436767578, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 9.347131729125977, "rewards/margins": 2.151369571685791, "rewards/rejected": 7.1957621574401855, "step": 5035 }, { "epoch": 1.11, "learning_rate": 4.317468361825595e-06, "logits/chosen": -1.865601897239685, "logits/rejected": -1.8393969535827637, "logps/chosen": -103.52508544921875, "logps/rejected": -99.11849975585938, "loss": 0.0716, "rewards/accuracies": 1.0, "rewards/chosen": 5.337441921234131, "rewards/margins": 2.2329726219177246, "rewards/rejected": 3.1044692993164062, "step": 5036 }, { "epoch": 1.11, "learning_rate": 4.315692855264859e-06, "logits/chosen": -2.079299211502075, "logits/rejected": -2.079299211502075, "logps/chosen": -41.086280822753906, "logps/rejected": -41.086280822753906, "loss": 0.3612, "rewards/accuracies": 0.0, "rewards/chosen": 3.4480950832366943, "rewards/margins": 0.0, "rewards/rejected": 3.4480950832366943, "step": 5037 }, { "epoch": 1.12, "learning_rate": 4.313917436635947e-06, "logits/chosen": -1.7649425268173218, "logits/rejected": -1.7198536396026611, "logps/chosen": -27.251798629760742, "logps/rejected": -7.76318359375, "loss": 0.2857, "rewards/accuracies": 1.0, "rewards/chosen": 2.5785272121429443, "rewards/margins": 1.2862639427185059, "rewards/rejected": 1.2922632694244385, "step": 5038 }, { "epoch": 1.12, "learning_rate": 4.312142106166992e-06, "logits/chosen": -2.229220390319824, "logits/rejected": -2.2630558013916016, "logps/chosen": -59.5711669921875, "logps/rejected": -91.09228515625, "loss": 0.173, "rewards/accuracies": 1.0, "rewards/chosen": 8.3848237991333, "rewards/margins": 1.0604567527770996, "rewards/rejected": 7.324367046356201, "step": 5039 }, { "epoch": 1.12, "learning_rate": 4.310366864086127e-06, "logits/chosen": -1.5991071462631226, "logits/rejected": -1.6480776071548462, "logps/chosen": -15.232508659362793, "logps/rejected": -28.845191955566406, "loss": 0.4087, "rewards/accuracies": 0.0, "rewards/chosen": 3.4408624172210693, "rewards/margins": -0.08144593238830566, "rewards/rejected": 3.522308349609375, "step": 5040 }, { "epoch": 1.12, "learning_rate": 4.308591710621458e-06, "logits/chosen": -2.006045341491699, "logits/rejected": -2.0104453563690186, "logps/chosen": -108.44941711425781, "logps/rejected": -241.0243377685547, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 9.738235473632812, "rewards/margins": 2.817521572113037, "rewards/rejected": 6.920713901519775, "step": 5041 }, { "epoch": 1.12, "learning_rate": 4.306816646001095e-06, "logits/chosen": -1.981653094291687, "logits/rejected": -1.986466407775879, "logps/chosen": -35.013092041015625, "logps/rejected": -31.69501304626465, "loss": 0.4476, "rewards/accuracies": 0.0, "rewards/chosen": 2.7337090969085693, "rewards/margins": -0.28489232063293457, "rewards/rejected": 3.018601417541504, "step": 5042 }, { "epoch": 1.12, "learning_rate": 4.305041670453127e-06, "logits/chosen": -2.2366249561309814, "logits/rejected": -2.1596786975860596, "logps/chosen": -64.13704681396484, "logps/rejected": -24.154476165771484, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 3.107869863510132, "rewards/margins": 2.2443037033081055, "rewards/rejected": 0.8635662198066711, "step": 5043 }, { "epoch": 1.12, "learning_rate": 4.303266784205632e-06, "logits/chosen": -1.9881396293640137, "logits/rejected": -1.9881396293640137, "logps/chosen": -28.163318634033203, "logps/rejected": -28.163318634033203, "loss": 1.1887, "rewards/accuracies": 0.0, "rewards/chosen": 4.540998458862305, "rewards/margins": 0.0, "rewards/rejected": 4.540998458862305, "step": 5044 }, { "epoch": 1.12, "learning_rate": 4.301491987486681e-06, "logits/chosen": -1.702628254890442, "logits/rejected": -1.5967122316360474, "logps/chosen": -58.572181701660156, "logps/rejected": -19.643110275268555, "loss": 0.1142, "rewards/accuracies": 1.0, "rewards/chosen": 2.1434578895568848, "rewards/margins": 1.3958516120910645, "rewards/rejected": 0.7476062774658203, "step": 5045 }, { "epoch": 1.12, "learning_rate": 4.299717280524329e-06, "logits/chosen": -1.6500356197357178, "logits/rejected": -1.5197134017944336, "logps/chosen": -53.19758224487305, "logps/rejected": -21.806324005126953, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 4.115993022918701, "rewards/margins": 2.5630455017089844, "rewards/rejected": 1.5529476404190063, "step": 5046 }, { "epoch": 1.12, "learning_rate": 4.297942663546626e-06, "logits/chosen": -1.8720165491104126, "logits/rejected": -1.8696211576461792, "logps/chosen": -28.655258178710938, "logps/rejected": -57.18732452392578, "loss": 0.2458, "rewards/accuracies": 1.0, "rewards/chosen": 3.1553666591644287, "rewards/margins": 0.6951196193695068, "rewards/rejected": 2.460247039794922, "step": 5047 }, { "epoch": 1.12, "learning_rate": 4.2961681367816e-06, "logits/chosen": -1.783370852470398, "logits/rejected": -1.7687212228775024, "logps/chosen": -39.02275848388672, "logps/rejected": -68.357421875, "loss": 0.4983, "rewards/accuracies": 1.0, "rewards/chosen": 3.5288681983947754, "rewards/margins": 1.4168212413787842, "rewards/rejected": 2.112046957015991, "step": 5048 }, { "epoch": 1.12, "learning_rate": 4.294393700457279e-06, "logits/chosen": -1.6534227132797241, "logits/rejected": -1.6274471282958984, "logps/chosen": -68.36553192138672, "logps/rejected": -57.992618560791016, "loss": 0.7629, "rewards/accuracies": 1.0, "rewards/chosen": 4.796144962310791, "rewards/margins": 4.320603847503662, "rewards/rejected": 0.4755413234233856, "step": 5049 }, { "epoch": 1.12, "learning_rate": 4.292619354801668e-06, "logits/chosen": -1.7916077375411987, "logits/rejected": -1.9037584066390991, "logps/chosen": -53.665130615234375, "logps/rejected": -100.20967102050781, "loss": 1.9507, "rewards/accuracies": 0.0, "rewards/chosen": 5.7402873039245605, "rewards/margins": -3.669219493865967, "rewards/rejected": 9.409506797790527, "step": 5050 }, { "epoch": 1.12, "learning_rate": 4.290845100042772e-06, "logits/chosen": -2.034189462661743, "logits/rejected": -2.000089168548584, "logps/chosen": -46.37519073486328, "logps/rejected": -60.806663513183594, "loss": 0.1602, "rewards/accuracies": 1.0, "rewards/chosen": 3.8220016956329346, "rewards/margins": 2.265427589416504, "rewards/rejected": 1.5565742254257202, "step": 5051 }, { "epoch": 1.12, "learning_rate": 4.289070936408576e-06, "logits/chosen": -1.9007858037948608, "logits/rejected": -1.9007858037948608, "logps/chosen": -42.28852462768555, "logps/rejected": -42.28852462768555, "loss": 0.3467, "rewards/accuracies": 0.0, "rewards/chosen": 4.402061939239502, "rewards/margins": 0.0, "rewards/rejected": 4.402061939239502, "step": 5052 }, { "epoch": 1.12, "learning_rate": 4.287296864127055e-06, "logits/chosen": -1.7702815532684326, "logits/rejected": -1.7028182744979858, "logps/chosen": -115.83055114746094, "logps/rejected": -96.75115966796875, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 6.476553440093994, "rewards/margins": 2.872798204421997, "rewards/rejected": 3.603755235671997, "step": 5053 }, { "epoch": 1.12, "learning_rate": 4.285522883426174e-06, "logits/chosen": -2.0024828910827637, "logits/rejected": -1.9928019046783447, "logps/chosen": -89.58883666992188, "logps/rejected": -111.25125885009766, "loss": 0.289, "rewards/accuracies": 1.0, "rewards/chosen": 9.399774551391602, "rewards/margins": 0.2738227844238281, "rewards/rejected": 9.125951766967773, "step": 5054 }, { "epoch": 1.12, "learning_rate": 4.283748994533884e-06, "logits/chosen": -2.2106471061706543, "logits/rejected": -2.237658977508545, "logps/chosen": -23.506084442138672, "logps/rejected": -55.98362731933594, "loss": 0.7692, "rewards/accuracies": 0.0, "rewards/chosen": 4.072425365447998, "rewards/margins": -0.4743785858154297, "rewards/rejected": 4.546803951263428, "step": 5055 }, { "epoch": 1.12, "learning_rate": 4.281975197678127e-06, "logits/chosen": -1.7778520584106445, "logits/rejected": -1.76406729221344, "logps/chosen": -53.083465576171875, "logps/rejected": -41.149375915527344, "loss": 0.7152, "rewards/accuracies": 1.0, "rewards/chosen": 4.217964172363281, "rewards/margins": 0.7474722862243652, "rewards/rejected": 3.470491886138916, "step": 5056 }, { "epoch": 1.12, "learning_rate": 4.2802014930868305e-06, "logits/chosen": -1.8376209735870361, "logits/rejected": -1.7786903381347656, "logps/chosen": -44.62615203857422, "logps/rejected": -41.3985710144043, "loss": 0.3632, "rewards/accuracies": 0.0, "rewards/chosen": 6.423938751220703, "rewards/margins": -0.013463497161865234, "rewards/rejected": 6.437402248382568, "step": 5057 }, { "epoch": 1.12, "learning_rate": 4.278427880987912e-06, "logits/chosen": -1.8033322095870972, "logits/rejected": -1.8012443780899048, "logps/chosen": -48.856571197509766, "logps/rejected": -59.04926681518555, "loss": 0.3734, "rewards/accuracies": 1.0, "rewards/chosen": 3.666644811630249, "rewards/margins": 0.31444716453552246, "rewards/rejected": 3.3521976470947266, "step": 5058 }, { "epoch": 1.12, "learning_rate": 4.276654361609276e-06, "logits/chosen": -1.9189778566360474, "logits/rejected": -1.9094687700271606, "logps/chosen": -48.17250061035156, "logps/rejected": -83.27609252929688, "loss": 0.4412, "rewards/accuracies": 1.0, "rewards/chosen": 4.161797523498535, "rewards/margins": 1.5688378810882568, "rewards/rejected": 2.5929596424102783, "step": 5059 }, { "epoch": 1.12, "learning_rate": 4.274880935178817e-06, "logits/chosen": -1.8729164600372314, "logits/rejected": -1.8495970964431763, "logps/chosen": -37.308074951171875, "logps/rejected": -33.584877014160156, "loss": 0.1847, "rewards/accuracies": 1.0, "rewards/chosen": 3.015364170074463, "rewards/margins": 1.1661919355392456, "rewards/rejected": 1.8491722345352173, "step": 5060 }, { "epoch": 1.12, "learning_rate": 4.273107601924413e-06, "logits/chosen": -1.9367661476135254, "logits/rejected": -1.9317060708999634, "logps/chosen": -124.83213806152344, "logps/rejected": -75.61512756347656, "loss": 0.9124, "rewards/accuracies": 0.0, "rewards/chosen": 5.164659023284912, "rewards/margins": -0.0606074333190918, "rewards/rejected": 5.225266456604004, "step": 5061 }, { "epoch": 1.12, "learning_rate": 4.271334362073937e-06, "logits/chosen": -1.8441721200942993, "logits/rejected": -1.8441721200942993, "logps/chosen": -25.32547378540039, "logps/rejected": -25.32547378540039, "loss": 0.6111, "rewards/accuracies": 0.0, "rewards/chosen": 2.3896889686584473, "rewards/margins": 0.0, "rewards/rejected": 2.3896889686584473, "step": 5062 }, { "epoch": 1.12, "learning_rate": 4.269561215855243e-06, "logits/chosen": -2.060642719268799, "logits/rejected": -1.9797172546386719, "logps/chosen": -129.08843994140625, "logps/rejected": -39.41284942626953, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 6.358960151672363, "rewards/margins": 3.5140740871429443, "rewards/rejected": 2.844886064529419, "step": 5063 }, { "epoch": 1.12, "learning_rate": 4.2677881634961745e-06, "logits/chosen": -1.9795169830322266, "logits/rejected": -1.9775389432907104, "logps/chosen": -66.95049285888672, "logps/rejected": -58.78969955444336, "loss": 0.1585, "rewards/accuracies": 1.0, "rewards/chosen": 3.540731191635132, "rewards/margins": 1.0000889301300049, "rewards/rejected": 2.540642261505127, "step": 5064 }, { "epoch": 1.12, "learning_rate": 4.26601520522457e-06, "logits/chosen": -1.980310082435608, "logits/rejected": -1.9908796548843384, "logps/chosen": -70.41638946533203, "logps/rejected": -61.194923400878906, "loss": 0.3158, "rewards/accuracies": 1.0, "rewards/chosen": 5.869927406311035, "rewards/margins": 0.739814281463623, "rewards/rejected": 5.130113124847412, "step": 5065 }, { "epoch": 1.12, "learning_rate": 4.264242341268243e-06, "logits/chosen": -1.850286602973938, "logits/rejected": -1.9144833087921143, "logps/chosen": -43.924888610839844, "logps/rejected": -120.90885162353516, "loss": 0.9696, "rewards/accuracies": 0.0, "rewards/chosen": 6.537052154541016, "rewards/margins": -1.7688493728637695, "rewards/rejected": 8.305901527404785, "step": 5066 }, { "epoch": 1.12, "learning_rate": 4.26246957185501e-06, "logits/chosen": -2.1796720027923584, "logits/rejected": -2.187208652496338, "logps/chosen": -64.50659942626953, "logps/rejected": -63.53636169433594, "loss": 0.3593, "rewards/accuracies": 1.0, "rewards/chosen": 7.654415130615234, "rewards/margins": 0.0012211799621582031, "rewards/rejected": 7.653193950653076, "step": 5067 }, { "epoch": 1.12, "learning_rate": 4.260696897212663e-06, "logits/chosen": -1.9794002771377563, "logits/rejected": -1.9897420406341553, "logps/chosen": -112.62031555175781, "logps/rejected": -63.120330810546875, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 6.950433254241943, "rewards/margins": 2.000335693359375, "rewards/rejected": 4.950097560882568, "step": 5068 }, { "epoch": 1.12, "learning_rate": 4.258924317568988e-06, "logits/chosen": -1.947512149810791, "logits/rejected": -1.9023220539093018, "logps/chosen": -96.48948669433594, "logps/rejected": -82.07129669189453, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 6.427255153656006, "rewards/margins": 2.312312126159668, "rewards/rejected": 4.114943027496338, "step": 5069 }, { "epoch": 1.12, "learning_rate": 4.257151833151756e-06, "logits/chosen": -1.8493025302886963, "logits/rejected": -1.791519284248352, "logps/chosen": -50.27220916748047, "logps/rejected": -45.484535217285156, "loss": 0.3808, "rewards/accuracies": 1.0, "rewards/chosen": 3.3461341857910156, "rewards/margins": 0.15215516090393066, "rewards/rejected": 3.193979024887085, "step": 5070 }, { "epoch": 1.12, "learning_rate": 4.255379444188729e-06, "logits/chosen": -2.1048591136932373, "logits/rejected": -2.073822259902954, "logps/chosen": -22.060670852661133, "logps/rejected": -28.956432342529297, "loss": 0.6841, "rewards/accuracies": 0.0, "rewards/chosen": 3.261765241622925, "rewards/margins": -0.6447286605834961, "rewards/rejected": 3.906493902206421, "step": 5071 }, { "epoch": 1.12, "learning_rate": 4.2536071509076535e-06, "logits/chosen": -1.9224933385849, "logits/rejected": -1.7140816450119019, "logps/chosen": -58.15531921386719, "logps/rejected": -51.01206970214844, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 4.0277910232543945, "rewards/margins": 3.8771021366119385, "rewards/rejected": 0.15068893134593964, "step": 5072 }, { "epoch": 1.12, "learning_rate": 4.251834953536262e-06, "logits/chosen": -1.6035982370376587, "logits/rejected": -1.6035982370376587, "logps/chosen": -34.77830505371094, "logps/rejected": -34.77830505371094, "loss": 0.4086, "rewards/accuracies": 0.0, "rewards/chosen": 1.0515003204345703, "rewards/margins": 0.0, "rewards/rejected": 1.0515003204345703, "step": 5073 }, { "epoch": 1.12, "learning_rate": 4.250062852302283e-06, "logits/chosen": -2.0058133602142334, "logits/rejected": -2.0071043968200684, "logps/chosen": -56.56050109863281, "logps/rejected": -91.34300231933594, "loss": 0.9566, "rewards/accuracies": 0.0, "rewards/chosen": 4.532547950744629, "rewards/margins": -1.7055611610412598, "rewards/rejected": 6.238109111785889, "step": 5074 }, { "epoch": 1.12, "learning_rate": 4.2482908474334225e-06, "logits/chosen": -2.000560760498047, "logits/rejected": -1.9570891857147217, "logps/chosen": -109.70924377441406, "logps/rejected": -70.65711975097656, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 6.047725200653076, "rewards/margins": 2.4876558780670166, "rewards/rejected": 3.5600693225860596, "step": 5075 }, { "epoch": 1.12, "learning_rate": 4.246518939157383e-06, "logits/chosen": -1.7337816953659058, "logits/rejected": -1.7337816953659058, "logps/chosen": -42.13660430908203, "logps/rejected": -42.13660430908203, "loss": 0.3466, "rewards/accuracies": 0.0, "rewards/chosen": 2.54913330078125, "rewards/margins": 0.0, "rewards/rejected": 2.54913330078125, "step": 5076 }, { "epoch": 1.12, "learning_rate": 4.244747127701846e-06, "logits/chosen": -1.8005023002624512, "logits/rejected": -1.7444490194320679, "logps/chosen": -76.91554260253906, "logps/rejected": -34.413917541503906, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 7.447839260101318, "rewards/margins": 3.940887212753296, "rewards/rejected": 3.5069520473480225, "step": 5077 }, { "epoch": 1.12, "learning_rate": 4.242975413294491e-06, "logits/chosen": -2.2101023197174072, "logits/rejected": -2.1182470321655273, "logps/chosen": -126.83248138427734, "logps/rejected": -46.492496490478516, "loss": 0.1241, "rewards/accuracies": 1.0, "rewards/chosen": 7.075660228729248, "rewards/margins": 1.3137259483337402, "rewards/rejected": 5.761934280395508, "step": 5078 }, { "epoch": 1.12, "learning_rate": 4.241203796162973e-06, "logits/chosen": -2.119861125946045, "logits/rejected": -2.102630853652954, "logps/chosen": -29.847972869873047, "logps/rejected": -38.81623458862305, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 3.3769307136535645, "rewards/margins": 0.5361526012420654, "rewards/rejected": 2.840778112411499, "step": 5079 }, { "epoch": 1.12, "learning_rate": 4.2394322765349456e-06, "logits/chosen": -2.2749273777008057, "logits/rejected": -2.22503662109375, "logps/chosen": -130.58966064453125, "logps/rejected": -60.08542251586914, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 7.498144626617432, "rewards/margins": 2.282838821411133, "rewards/rejected": 5.215305805206299, "step": 5080 }, { "epoch": 1.12, "learning_rate": 4.237660854638039e-06, "logits/chosen": -1.7804046869277954, "logits/rejected": -1.75593900680542, "logps/chosen": -64.7421875, "logps/rejected": -88.52449035644531, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": 7.515683174133301, "rewards/margins": 1.994542121887207, "rewards/rejected": 5.521141052246094, "step": 5081 }, { "epoch": 1.12, "learning_rate": 4.2358895306998825e-06, "logits/chosen": -2.0123538970947266, "logits/rejected": -2.0534136295318604, "logps/chosen": -66.94783782958984, "logps/rejected": -115.89476013183594, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 7.9658331871032715, "rewards/margins": 4.522795677185059, "rewards/rejected": 3.443037509918213, "step": 5082 }, { "epoch": 1.13, "learning_rate": 4.234118304948085e-06, "logits/chosen": -1.8904951810836792, "logits/rejected": -1.8884931802749634, "logps/chosen": -69.95425415039062, "logps/rejected": -71.38987731933594, "loss": 0.149, "rewards/accuracies": 1.0, "rewards/chosen": 5.743125915527344, "rewards/margins": 1.250636100769043, "rewards/rejected": 4.492489814758301, "step": 5083 }, { "epoch": 1.13, "learning_rate": 4.232347177610241e-06, "logits/chosen": -1.7874618768692017, "logits/rejected": -1.8209186792373657, "logps/chosen": -44.575504302978516, "logps/rejected": -47.577083587646484, "loss": 1.4284, "rewards/accuracies": 0.0, "rewards/chosen": 3.4109532833099365, "rewards/margins": -2.6375367641448975, "rewards/rejected": 6.048490047454834, "step": 5084 }, { "epoch": 1.13, "learning_rate": 4.230576148913943e-06, "logits/chosen": -1.9214563369750977, "logits/rejected": -1.8709102869033813, "logps/chosen": -61.036495208740234, "logps/rejected": -61.67633056640625, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 5.552200794219971, "rewards/margins": 4.098420143127441, "rewards/rejected": 1.4537804126739502, "step": 5085 }, { "epoch": 1.13, "learning_rate": 4.228805219086757e-06, "logits/chosen": -2.100719451904297, "logits/rejected": -1.7043814659118652, "logps/chosen": -75.47529602050781, "logps/rejected": -153.40187072753906, "loss": 0.6895, "rewards/accuracies": 0.0, "rewards/chosen": 4.609818458557129, "rewards/margins": -1.0052037239074707, "rewards/rejected": 5.6150221824646, "step": 5086 }, { "epoch": 1.13, "learning_rate": 4.227034388356248e-06, "logits/chosen": -1.635508418083191, "logits/rejected": -1.6094623804092407, "logps/chosen": -71.30793762207031, "logps/rejected": -90.93266296386719, "loss": 0.2955, "rewards/accuracies": 1.0, "rewards/chosen": 3.2629647254943848, "rewards/margins": 0.2785193920135498, "rewards/rejected": 2.984445333480835, "step": 5087 }, { "epoch": 1.13, "learning_rate": 4.225263656949961e-06, "logits/chosen": -1.6453667879104614, "logits/rejected": -1.756296992301941, "logps/chosen": -7.6176958084106445, "logps/rejected": -89.90557861328125, "loss": 2.7771, "rewards/accuracies": 0.0, "rewards/chosen": 2.016583204269409, "rewards/margins": -4.660443305969238, "rewards/rejected": 6.677026271820068, "step": 5088 }, { "epoch": 1.13, "learning_rate": 4.223493025095433e-06, "logits/chosen": -1.611398458480835, "logits/rejected": -1.3461086750030518, "logps/chosen": -111.08396911621094, "logps/rejected": -103.77742004394531, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 8.03925609588623, "rewards/margins": 2.308877468109131, "rewards/rejected": 5.7303786277771, "step": 5089 }, { "epoch": 1.13, "learning_rate": 4.221722493020183e-06, "logits/chosen": -1.4928410053253174, "logits/rejected": -1.4928410053253174, "logps/chosen": -24.91777992248535, "logps/rejected": -24.91777992248535, "loss": 0.5509, "rewards/accuracies": 0.0, "rewards/chosen": 2.5653345584869385, "rewards/margins": 0.0, "rewards/rejected": 2.5653345584869385, "step": 5090 }, { "epoch": 1.13, "learning_rate": 4.219952060951723e-06, "logits/chosen": -1.852797031402588, "logits/rejected": -1.8451406955718994, "logps/chosen": -126.6298599243164, "logps/rejected": -87.92913818359375, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 8.551961898803711, "rewards/margins": 2.496209144592285, "rewards/rejected": 6.055752754211426, "step": 5091 }, { "epoch": 1.13, "learning_rate": 4.218181729117547e-06, "logits/chosen": -2.1699934005737305, "logits/rejected": -1.9749363660812378, "logps/chosen": -86.24446105957031, "logps/rejected": -133.77352905273438, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 7.132256984710693, "rewards/margins": 9.107661247253418, "rewards/rejected": -1.9754043817520142, "step": 5092 }, { "epoch": 1.13, "learning_rate": 4.216411497745141e-06, "logits/chosen": -1.8263312578201294, "logits/rejected": -1.7899370193481445, "logps/chosen": -76.72322082519531, "logps/rejected": -102.10356140136719, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": 8.04609203338623, "rewards/margins": 5.5109944343566895, "rewards/rejected": 2.535097599029541, "step": 5093 }, { "epoch": 1.13, "learning_rate": 4.214641367061974e-06, "logits/chosen": -1.9845271110534668, "logits/rejected": -1.9270609617233276, "logps/chosen": -129.57308959960938, "logps/rejected": -76.59738159179688, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": 6.364843845367432, "rewards/margins": 1.749760627746582, "rewards/rejected": 4.61508321762085, "step": 5094 }, { "epoch": 1.13, "learning_rate": 4.212871337295502e-06, "logits/chosen": -1.844435453414917, "logits/rejected": -1.8383514881134033, "logps/chosen": -34.658729553222656, "logps/rejected": -76.74407958984375, "loss": 1.0894, "rewards/accuracies": 0.0, "rewards/chosen": 2.621324300765991, "rewards/margins": -0.8742415904998779, "rewards/rejected": 3.495565891265869, "step": 5095 }, { "epoch": 1.13, "learning_rate": 4.211101408673172e-06, "logits/chosen": -2.025907039642334, "logits/rejected": -1.908378005027771, "logps/chosen": -77.85160064697266, "logps/rejected": -125.55038452148438, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 11.159804344177246, "rewards/margins": 3.171907424926758, "rewards/rejected": 7.987896919250488, "step": 5096 }, { "epoch": 1.13, "learning_rate": 4.2093315814224126e-06, "logits/chosen": -1.7076220512390137, "logits/rejected": -1.779171347618103, "logps/chosen": -39.477500915527344, "logps/rejected": -94.76336669921875, "loss": 0.8857, "rewards/accuracies": 0.0, "rewards/chosen": 8.896055221557617, "rewards/margins": -1.5693788528442383, "rewards/rejected": 10.465434074401855, "step": 5097 }, { "epoch": 1.13, "learning_rate": 4.207561855770646e-06, "logits/chosen": -1.966244101524353, "logits/rejected": -1.9535623788833618, "logps/chosen": -36.62277603149414, "logps/rejected": -55.27174377441406, "loss": 0.2283, "rewards/accuracies": 1.0, "rewards/chosen": 3.2042758464813232, "rewards/margins": 0.6014118194580078, "rewards/rejected": 2.6028640270233154, "step": 5098 }, { "epoch": 1.13, "learning_rate": 4.205792231945274e-06, "logits/chosen": -2.047168493270874, "logits/rejected": -2.078989267349243, "logps/chosen": -125.5490951538086, "logps/rejected": -180.60940551757812, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 12.633736610412598, "rewards/margins": 4.809142589569092, "rewards/rejected": 7.824594020843506, "step": 5099 }, { "epoch": 1.13, "learning_rate": 4.204022710173694e-06, "logits/chosen": -1.7272368669509888, "logits/rejected": -1.7272368669509888, "logps/chosen": -65.81538391113281, "logps/rejected": -65.81538391113281, "loss": 0.3748, "rewards/accuracies": 0.0, "rewards/chosen": 5.9658074378967285, "rewards/margins": 0.0, "rewards/rejected": 5.9658074378967285, "step": 5100 }, { "epoch": 1.13, "learning_rate": 4.202253290683279e-06, "logits/chosen": -2.0699303150177, "logits/rejected": -1.8142000436782837, "logps/chosen": -79.51541900634766, "logps/rejected": -176.90414428710938, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": 8.33364486694336, "rewards/margins": 1.9655098915100098, "rewards/rejected": 6.36813497543335, "step": 5101 }, { "epoch": 1.13, "learning_rate": 4.200483973701401e-06, "logits/chosen": -1.899723768234253, "logits/rejected": -1.7065924406051636, "logps/chosen": -94.32101440429688, "logps/rejected": -18.597434997558594, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 6.292651653289795, "rewards/margins": 5.906242847442627, "rewards/rejected": 0.38640880584716797, "step": 5102 }, { "epoch": 1.13, "learning_rate": 4.19871475945541e-06, "logits/chosen": -1.8642489910125732, "logits/rejected": -1.8642489910125732, "logps/chosen": -26.72443389892578, "logps/rejected": -26.72443389892578, "loss": 0.3544, "rewards/accuracies": 0.0, "rewards/chosen": 2.4731814861297607, "rewards/margins": 0.0, "rewards/rejected": 2.4731814861297607, "step": 5103 }, { "epoch": 1.13, "learning_rate": 4.196945648172646e-06, "logits/chosen": -1.9420338869094849, "logits/rejected": -1.950301170349121, "logps/chosen": -33.66117858886719, "logps/rejected": -54.04899597167969, "loss": 0.1794, "rewards/accuracies": 1.0, "rewards/chosen": 4.435678005218506, "rewards/margins": 0.8641173839569092, "rewards/rejected": 3.5715606212615967, "step": 5104 }, { "epoch": 1.13, "learning_rate": 4.195176640080436e-06, "logits/chosen": -1.9012324810028076, "logits/rejected": -1.9828060865402222, "logps/chosen": -43.041419982910156, "logps/rejected": -125.38809204101562, "loss": 2.0941, "rewards/accuracies": 0.0, "rewards/chosen": 4.477773189544678, "rewards/margins": -4.007115840911865, "rewards/rejected": 8.484889030456543, "step": 5105 }, { "epoch": 1.13, "learning_rate": 4.193407735406091e-06, "logits/chosen": -2.1102094650268555, "logits/rejected": -2.05794358253479, "logps/chosen": -135.219970703125, "logps/rejected": -42.21924591064453, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 6.385859966278076, "rewards/margins": 3.0172433853149414, "rewards/rejected": 3.3686165809631348, "step": 5106 }, { "epoch": 1.13, "learning_rate": 4.191638934376915e-06, "logits/chosen": -1.6774698495864868, "logits/rejected": -1.6774698495864868, "logps/chosen": -18.351531982421875, "logps/rejected": -18.351531982421875, "loss": 0.8022, "rewards/accuracies": 0.0, "rewards/chosen": 1.8276824951171875, "rewards/margins": 0.0, "rewards/rejected": 1.8276824951171875, "step": 5107 }, { "epoch": 1.13, "learning_rate": 4.18987023722019e-06, "logits/chosen": -1.767181634902954, "logits/rejected": -1.6961220502853394, "logps/chosen": -35.94816589355469, "logps/rejected": -43.07787322998047, "loss": 0.1655, "rewards/accuracies": 1.0, "rewards/chosen": 4.188476085662842, "rewards/margins": 1.1374881267547607, "rewards/rejected": 3.050987958908081, "step": 5108 }, { "epoch": 1.13, "learning_rate": 4.1881016441631955e-06, "logits/chosen": -1.934801697731018, "logits/rejected": -1.875234603881836, "logps/chosen": -65.58705139160156, "logps/rejected": -69.92807006835938, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 5.741220951080322, "rewards/margins": 2.4684982299804688, "rewards/rejected": 3.2727227210998535, "step": 5109 }, { "epoch": 1.13, "learning_rate": 4.1863331554331855e-06, "logits/chosen": -1.8228363990783691, "logits/rejected": -1.732853651046753, "logps/chosen": -97.7298355102539, "logps/rejected": -48.207679748535156, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 7.1177544593811035, "rewards/margins": 3.9015090465545654, "rewards/rejected": 3.216245412826538, "step": 5110 }, { "epoch": 1.13, "learning_rate": 4.184564771257411e-06, "logits/chosen": -2.104572296142578, "logits/rejected": -1.569075345993042, "logps/chosen": -134.70889282226562, "logps/rejected": -77.58465576171875, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 7.36488676071167, "rewards/margins": 3.7455873489379883, "rewards/rejected": 3.6192994117736816, "step": 5111 }, { "epoch": 1.13, "learning_rate": 4.182796491863101e-06, "logits/chosen": -1.9427913427352905, "logits/rejected": -1.9383336305618286, "logps/chosen": -34.59339904785156, "logps/rejected": -61.3993034362793, "loss": 0.2475, "rewards/accuracies": 1.0, "rewards/chosen": 3.700646162033081, "rewards/margins": 0.4850146770477295, "rewards/rejected": 3.2156314849853516, "step": 5112 }, { "epoch": 1.13, "learning_rate": 4.1810283174774805e-06, "logits/chosen": -1.6525968313217163, "logits/rejected": -1.4996446371078491, "logps/chosen": -67.67210388183594, "logps/rejected": -8.086502075195312, "loss": 0.074, "rewards/accuracies": 1.0, "rewards/chosen": 3.5072312355041504, "rewards/margins": 2.441775321960449, "rewards/rejected": 1.0654557943344116, "step": 5113 }, { "epoch": 1.13, "learning_rate": 4.179260248327751e-06, "logits/chosen": -2.024705648422241, "logits/rejected": -2.0015928745269775, "logps/chosen": -55.559120178222656, "logps/rejected": -52.844451904296875, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 4.594698429107666, "rewards/margins": 1.6868431568145752, "rewards/rejected": 2.907855272293091, "step": 5114 }, { "epoch": 1.13, "learning_rate": 4.177492284641105e-06, "logits/chosen": -1.6730716228485107, "logits/rejected": -1.7510799169540405, "logps/chosen": -45.199432373046875, "logps/rejected": -85.92292785644531, "loss": 1.0541, "rewards/accuracies": 0.0, "rewards/chosen": 3.5771148204803467, "rewards/margins": -1.7820298671722412, "rewards/rejected": 5.359144687652588, "step": 5115 }, { "epoch": 1.13, "learning_rate": 4.175724426644724e-06, "logits/chosen": -2.0458178520202637, "logits/rejected": -1.9598532915115356, "logps/chosen": -120.52685546875, "logps/rejected": -134.44686889648438, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 10.209701538085938, "rewards/margins": 5.106210231781006, "rewards/rejected": 5.103491306304932, "step": 5116 }, { "epoch": 1.13, "learning_rate": 4.173956674565773e-06, "logits/chosen": -1.8127717971801758, "logits/rejected": -1.8370145559310913, "logps/chosen": -80.55973052978516, "logps/rejected": -67.64825439453125, "loss": 0.4288, "rewards/accuracies": 0.0, "rewards/chosen": 6.550843238830566, "rewards/margins": -0.28146886825561523, "rewards/rejected": 6.832312107086182, "step": 5117 }, { "epoch": 1.13, "learning_rate": 4.172189028631404e-06, "logits/chosen": -2.1539671421051025, "logits/rejected": -2.1368496417999268, "logps/chosen": -60.97663879394531, "logps/rejected": -89.46399688720703, "loss": 1.9307, "rewards/accuracies": 1.0, "rewards/chosen": 4.724861145019531, "rewards/margins": 2.127300262451172, "rewards/rejected": 2.5975608825683594, "step": 5118 }, { "epoch": 1.13, "learning_rate": 4.170421489068753e-06, "logits/chosen": -2.1166205406188965, "logits/rejected": -2.038151979446411, "logps/chosen": -46.91345977783203, "logps/rejected": -6.119424819946289, "loss": 0.1732, "rewards/accuracies": 1.0, "rewards/chosen": 2.0668320655822754, "rewards/margins": 1.0270676612854004, "rewards/rejected": 1.039764404296875, "step": 5119 }, { "epoch": 1.13, "learning_rate": 4.168654056104948e-06, "logits/chosen": -2.0734124183654785, "logits/rejected": -2.043696641921997, "logps/chosen": -31.804500579833984, "logps/rejected": -35.57624816894531, "loss": 0.244, "rewards/accuracies": 1.0, "rewards/chosen": 3.5532643795013428, "rewards/margins": 0.9453561305999756, "rewards/rejected": 2.607908248901367, "step": 5120 }, { "epoch": 1.13, "learning_rate": 4.166886729967098e-06, "logits/chosen": -2.3184359073638916, "logits/rejected": -2.2563247680664062, "logps/chosen": -80.55766296386719, "logps/rejected": -35.176170349121094, "loss": 0.2348, "rewards/accuracies": 1.0, "rewards/chosen": 3.8882126808166504, "rewards/margins": 2.561467409133911, "rewards/rejected": 1.3267452716827393, "step": 5121 }, { "epoch": 1.13, "learning_rate": 4.165119510882301e-06, "logits/chosen": -2.0206167697906494, "logits/rejected": -2.0040316581726074, "logps/chosen": -102.9859619140625, "logps/rejected": -126.89094543457031, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": 8.040249824523926, "rewards/margins": 3.137944221496582, "rewards/rejected": 4.902305603027344, "step": 5122 }, { "epoch": 1.13, "learning_rate": 4.1633523990776395e-06, "logits/chosen": -1.9252119064331055, "logits/rejected": -1.9083486795425415, "logps/chosen": -52.448646545410156, "logps/rejected": -81.52320098876953, "loss": 0.3633, "rewards/accuracies": 1.0, "rewards/chosen": 6.488372325897217, "rewards/margins": 0.3062591552734375, "rewards/rejected": 6.182113170623779, "step": 5123 }, { "epoch": 1.13, "learning_rate": 4.161585394780182e-06, "logits/chosen": -1.8235862255096436, "logits/rejected": -1.714087963104248, "logps/chosen": -36.96450424194336, "logps/rejected": -18.751358032226562, "loss": 1.3496, "rewards/accuracies": 1.0, "rewards/chosen": 2.0889217853546143, "rewards/margins": 1.0519342422485352, "rewards/rejected": 1.036987543106079, "step": 5124 }, { "epoch": 1.13, "learning_rate": 4.159818498216988e-06, "logits/chosen": -2.080514669418335, "logits/rejected": -2.1055493354797363, "logps/chosen": -55.16324996948242, "logps/rejected": -79.32780456542969, "loss": 0.2601, "rewards/accuracies": 1.0, "rewards/chosen": 5.896988391876221, "rewards/margins": 0.47028684616088867, "rewards/rejected": 5.426701545715332, "step": 5125 }, { "epoch": 1.13, "learning_rate": 4.158051709615095e-06, "logits/chosen": -1.8797645568847656, "logits/rejected": -1.9174118041992188, "logps/chosen": -55.26347351074219, "logps/rejected": -92.024658203125, "loss": 1.5223, "rewards/accuracies": 0.0, "rewards/chosen": 5.118767738342285, "rewards/margins": -1.8032784461975098, "rewards/rejected": 6.922046184539795, "step": 5126 }, { "epoch": 1.13, "learning_rate": 4.1562850292015355e-06, "logits/chosen": -1.9317134618759155, "logits/rejected": -2.028846025466919, "logps/chosen": -90.69148254394531, "logps/rejected": -146.1536865234375, "loss": 0.3149, "rewards/accuracies": 1.0, "rewards/chosen": 9.640373229980469, "rewards/margins": 0.2088298797607422, "rewards/rejected": 9.431543350219727, "step": 5127 }, { "epoch": 1.14, "learning_rate": 4.15451845720332e-06, "logits/chosen": -1.8683967590332031, "logits/rejected": -1.8347588777542114, "logps/chosen": -110.80021667480469, "logps/rejected": -103.80717468261719, "loss": 0.2628, "rewards/accuracies": 1.0, "rewards/chosen": 6.942945957183838, "rewards/margins": 0.6427488327026367, "rewards/rejected": 6.300197124481201, "step": 5128 }, { "epoch": 1.14, "learning_rate": 4.152751993847452e-06, "logits/chosen": -1.9857712984085083, "logits/rejected": -1.9617559909820557, "logps/chosen": -34.58665466308594, "logps/rejected": -76.49226379394531, "loss": 0.201, "rewards/accuracies": 1.0, "rewards/chosen": 4.162890911102295, "rewards/margins": 0.717228889465332, "rewards/rejected": 3.445662021636963, "step": 5129 }, { "epoch": 1.14, "learning_rate": 4.150985639360914e-06, "logits/chosen": -1.8587135076522827, "logits/rejected": -1.8368006944656372, "logps/chosen": -63.34757995605469, "logps/rejected": -48.65074920654297, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": 4.620275020599365, "rewards/margins": 1.688859462738037, "rewards/rejected": 2.931415557861328, "step": 5130 }, { "epoch": 1.14, "learning_rate": 4.149219393970682e-06, "logits/chosen": -1.9878807067871094, "logits/rejected": -1.9878807067871094, "logps/chosen": -51.971736907958984, "logps/rejected": -51.971736907958984, "loss": 0.36, "rewards/accuracies": 0.0, "rewards/chosen": 6.2835187911987305, "rewards/margins": 0.0, "rewards/rejected": 6.2835187911987305, "step": 5131 }, { "epoch": 1.14, "learning_rate": 4.147453257903711e-06, "logits/chosen": -1.6709340810775757, "logits/rejected": -1.3951691389083862, "logps/chosen": -99.07955932617188, "logps/rejected": -83.66288757324219, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 10.211207389831543, "rewards/margins": 3.1538329124450684, "rewards/rejected": 7.057374477386475, "step": 5132 }, { "epoch": 1.14, "learning_rate": 4.145687231386948e-06, "logits/chosen": -1.9691792726516724, "logits/rejected": -1.9546750783920288, "logps/chosen": -35.092079162597656, "logps/rejected": -44.241127014160156, "loss": 0.5616, "rewards/accuracies": 0.0, "rewards/chosen": 2.604330539703369, "rewards/margins": -0.10457682609558105, "rewards/rejected": 2.70890736579895, "step": 5133 }, { "epoch": 1.14, "learning_rate": 4.143921314647323e-06, "logits/chosen": -1.9522590637207031, "logits/rejected": -2.0110011100769043, "logps/chosen": -88.68580627441406, "logps/rejected": -160.2394256591797, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 10.655618667602539, "rewards/margins": 2.59696102142334, "rewards/rejected": 8.0586576461792, "step": 5134 }, { "epoch": 1.14, "learning_rate": 4.142155507911748e-06, "logits/chosen": -1.8696262836456299, "logits/rejected": -1.7024255990982056, "logps/chosen": -96.86436462402344, "logps/rejected": -39.15642166137695, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 5.973625183105469, "rewards/margins": 2.1289234161376953, "rewards/rejected": 3.8447017669677734, "step": 5135 }, { "epoch": 1.14, "learning_rate": 4.140389811407132e-06, "logits/chosen": -2.0203309059143066, "logits/rejected": -2.0120832920074463, "logps/chosen": -82.22235870361328, "logps/rejected": -64.08596801757812, "loss": 0.1904, "rewards/accuracies": 1.0, "rewards/chosen": 3.927382707595825, "rewards/margins": 0.8733489513397217, "rewards/rejected": 3.0540337562561035, "step": 5136 }, { "epoch": 1.14, "learning_rate": 4.1386242253603555e-06, "logits/chosen": -1.6287785768508911, "logits/rejected": -1.6268067359924316, "logps/chosen": -160.0697784423828, "logps/rejected": -63.84864807128906, "loss": 0.1936, "rewards/accuracies": 1.0, "rewards/chosen": 5.709300518035889, "rewards/margins": 0.8808655738830566, "rewards/rejected": 4.828434944152832, "step": 5137 }, { "epoch": 1.14, "learning_rate": 4.136858749998298e-06, "logits/chosen": -1.3340493440628052, "logits/rejected": -1.2466061115264893, "logps/chosen": -30.754392623901367, "logps/rejected": -9.339374542236328, "loss": 0.187, "rewards/accuracies": 1.0, "rewards/chosen": 2.688170909881592, "rewards/margins": 0.866086483001709, "rewards/rejected": 1.8220844268798828, "step": 5138 }, { "epoch": 1.14, "learning_rate": 4.135093385547814e-06, "logits/chosen": -2.0704944133758545, "logits/rejected": -1.9922678470611572, "logps/chosen": -41.277557373046875, "logps/rejected": -7.785378456115723, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": 2.553542375564575, "rewards/margins": 1.7137677669525146, "rewards/rejected": 0.8397746086120605, "step": 5139 }, { "epoch": 1.14, "learning_rate": 4.133328132235752e-06, "logits/chosen": -2.1709840297698975, "logits/rejected": -2.1378061771392822, "logps/chosen": -64.97688293457031, "logps/rejected": -103.24495697021484, "loss": 0.1514, "rewards/accuracies": 1.0, "rewards/chosen": 9.70294189453125, "rewards/margins": 1.3505678176879883, "rewards/rejected": 8.352374076843262, "step": 5140 }, { "epoch": 1.14, "learning_rate": 4.13156299028894e-06, "logits/chosen": -2.029972791671753, "logits/rejected": -1.9872616529464722, "logps/chosen": -30.930362701416016, "logps/rejected": -40.264305114746094, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 3.575652837753296, "rewards/margins": 0.770251989364624, "rewards/rejected": 2.805400848388672, "step": 5141 }, { "epoch": 1.14, "learning_rate": 4.129797959934198e-06, "logits/chosen": -1.7215676307678223, "logits/rejected": -1.696061372756958, "logps/chosen": -69.95386505126953, "logps/rejected": -46.131980895996094, "loss": 0.2699, "rewards/accuracies": 1.0, "rewards/chosen": 5.8438239097595215, "rewards/margins": 0.34244298934936523, "rewards/rejected": 5.501380920410156, "step": 5142 }, { "epoch": 1.14, "learning_rate": 4.128033041398324e-06, "logits/chosen": -1.925588607788086, "logits/rejected": -1.5410363674163818, "logps/chosen": -55.00972366333008, "logps/rejected": -126.57523345947266, "loss": 0.15, "rewards/accuracies": 1.0, "rewards/chosen": 4.2391276359558105, "rewards/margins": 1.8242051601409912, "rewards/rejected": 2.4149224758148193, "step": 5143 }, { "epoch": 1.14, "learning_rate": 4.12626823490811e-06, "logits/chosen": -1.7378699779510498, "logits/rejected": -1.738569736480713, "logps/chosen": -54.990333557128906, "logps/rejected": -64.05429077148438, "loss": 0.2134, "rewards/accuracies": 1.0, "rewards/chosen": 2.5169289112091064, "rewards/margins": 1.0445541143417358, "rewards/rejected": 1.4723747968673706, "step": 5144 }, { "epoch": 1.14, "learning_rate": 4.124503540690329e-06, "logits/chosen": -1.893164873123169, "logits/rejected": -1.8527852296829224, "logps/chosen": -49.325294494628906, "logps/rejected": -55.09276580810547, "loss": 0.2028, "rewards/accuracies": 1.0, "rewards/chosen": 4.082240581512451, "rewards/margins": 0.7007288932800293, "rewards/rejected": 3.381511688232422, "step": 5145 }, { "epoch": 1.14, "learning_rate": 4.122738958971736e-06, "logits/chosen": -1.7928835153579712, "logits/rejected": -1.816956877708435, "logps/chosen": -130.4805908203125, "logps/rejected": -85.01350402832031, "loss": 0.3653, "rewards/accuracies": 1.0, "rewards/chosen": 6.4294281005859375, "rewards/margins": 3.2632668018341064, "rewards/rejected": 3.166161298751831, "step": 5146 }, { "epoch": 1.14, "learning_rate": 4.1209744899790796e-06, "logits/chosen": -2.0913524627685547, "logits/rejected": -2.1227872371673584, "logps/chosen": -50.51948547363281, "logps/rejected": -117.11347961425781, "loss": 1.7822, "rewards/accuracies": 0.0, "rewards/chosen": 5.055543422698975, "rewards/margins": -3.5252137184143066, "rewards/rejected": 8.580757141113281, "step": 5147 }, { "epoch": 1.14, "learning_rate": 4.119210133939085e-06, "logits/chosen": -1.7123361825942993, "logits/rejected": -1.6348731517791748, "logps/chosen": -31.273303985595703, "logps/rejected": -32.056251525878906, "loss": 0.3052, "rewards/accuracies": 1.0, "rewards/chosen": 3.704227924346924, "rewards/margins": 0.17724204063415527, "rewards/rejected": 3.5269858837127686, "step": 5148 }, { "epoch": 1.14, "learning_rate": 4.117445891078474e-06, "logits/chosen": -1.796550989151001, "logits/rejected": -1.793235182762146, "logps/chosen": -67.30615234375, "logps/rejected": -87.41033935546875, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 8.187675476074219, "rewards/margins": 2.4804534912109375, "rewards/rejected": 5.707221984863281, "step": 5149 }, { "epoch": 1.14, "learning_rate": 4.115681761623941e-06, "logits/chosen": -1.6958510875701904, "logits/rejected": -1.5907683372497559, "logps/chosen": -31.51380729675293, "logps/rejected": -14.749322891235352, "loss": 0.3621, "rewards/accuracies": 1.0, "rewards/chosen": 1.739323616027832, "rewards/margins": 0.5372229814529419, "rewards/rejected": 1.2021006345748901, "step": 5150 }, { "epoch": 1.14, "learning_rate": 4.113917745802179e-06, "logits/chosen": -2.029388189315796, "logits/rejected": -2.040471315383911, "logps/chosen": -108.69856262207031, "logps/rejected": -101.42575073242188, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 8.033024787902832, "rewards/margins": 2.0294909477233887, "rewards/rejected": 6.003533840179443, "step": 5151 }, { "epoch": 1.14, "learning_rate": 4.112153843839855e-06, "logits/chosen": -1.788041591644287, "logits/rejected": -1.8465214967727661, "logps/chosen": -23.540266036987305, "logps/rejected": -66.85527038574219, "loss": 0.5165, "rewards/accuracies": 0.0, "rewards/chosen": 3.0145349502563477, "rewards/margins": -0.48618149757385254, "rewards/rejected": 3.5007164478302, "step": 5152 }, { "epoch": 1.14, "learning_rate": 4.1103900559636276e-06, "logits/chosen": -1.6930211782455444, "logits/rejected": -1.684517741203308, "logps/chosen": -48.173240661621094, "logps/rejected": -41.62286376953125, "loss": 0.8067, "rewards/accuracies": 0.0, "rewards/chosen": 4.066326141357422, "rewards/margins": -1.3865852355957031, "rewards/rejected": 5.452911376953125, "step": 5153 }, { "epoch": 1.14, "learning_rate": 4.108626382400142e-06, "logits/chosen": -2.1852753162384033, "logits/rejected": -2.1420722007751465, "logps/chosen": -51.56279373168945, "logps/rejected": -30.274879455566406, "loss": 0.136, "rewards/accuracies": 1.0, "rewards/chosen": 3.3712291717529297, "rewards/margins": 2.228691577911377, "rewards/rejected": 1.1425377130508423, "step": 5154 }, { "epoch": 1.14, "learning_rate": 4.106862823376021e-06, "logits/chosen": -1.7338554859161377, "logits/rejected": -1.6310477256774902, "logps/chosen": -93.61569213867188, "logps/rejected": -68.98652648925781, "loss": 0.3905, "rewards/accuracies": 0.0, "rewards/chosen": 4.5994553565979, "rewards/margins": -0.1465315818786621, "rewards/rejected": 4.7459869384765625, "step": 5155 }, { "epoch": 1.14, "learning_rate": 4.105099379117881e-06, "logits/chosen": -1.7955518960952759, "logits/rejected": -1.8006641864776611, "logps/chosen": -42.091041564941406, "logps/rejected": -70.59292602539062, "loss": 3.4174, "rewards/accuracies": 0.0, "rewards/chosen": 3.7839715480804443, "rewards/margins": -0.5577270984649658, "rewards/rejected": 4.34169864654541, "step": 5156 }, { "epoch": 1.14, "learning_rate": 4.1033360498523175e-06, "logits/chosen": -2.0097248554229736, "logits/rejected": -1.9997092485427856, "logps/chosen": -61.67069625854492, "logps/rejected": -44.89778137207031, "loss": 0.1971, "rewards/accuracies": 1.0, "rewards/chosen": 4.01988410949707, "rewards/margins": 0.8950693607330322, "rewards/rejected": 3.124814748764038, "step": 5157 }, { "epoch": 1.14, "learning_rate": 4.1015728358059185e-06, "logits/chosen": -1.8872734308242798, "logits/rejected": -1.8622758388519287, "logps/chosen": -38.08896255493164, "logps/rejected": -174.97952270507812, "loss": 0.2717, "rewards/accuracies": 1.0, "rewards/chosen": 8.4970121383667, "rewards/margins": 0.3485870361328125, "rewards/rejected": 8.148425102233887, "step": 5158 }, { "epoch": 1.14, "learning_rate": 4.099809737205249e-06, "logits/chosen": -2.132039785385132, "logits/rejected": -2.134335994720459, "logps/chosen": -53.26183319091797, "logps/rejected": -69.06707000732422, "loss": 0.5008, "rewards/accuracies": 0.0, "rewards/chosen": 6.49280309677124, "rewards/margins": -0.4489612579345703, "rewards/rejected": 6.9417643547058105, "step": 5159 }, { "epoch": 1.14, "learning_rate": 4.098046754276865e-06, "logits/chosen": -1.9836229085922241, "logits/rejected": -2.003822088241577, "logps/chosen": -32.54609680175781, "logps/rejected": -44.366981506347656, "loss": 0.6829, "rewards/accuracies": 0.0, "rewards/chosen": 3.4664466381073, "rewards/margins": -1.0709426403045654, "rewards/rejected": 4.537389278411865, "step": 5160 }, { "epoch": 1.14, "learning_rate": 4.0962838872473046e-06, "logits/chosen": -2.147294282913208, "logits/rejected": -2.1485366821289062, "logps/chosen": -60.79861068725586, "logps/rejected": -80.88685607910156, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 4.927115440368652, "rewards/margins": 1.7269291877746582, "rewards/rejected": 3.200186252593994, "step": 5161 }, { "epoch": 1.14, "learning_rate": 4.094521136343092e-06, "logits/chosen": -1.7555164098739624, "logits/rejected": -1.7256271839141846, "logps/chosen": -49.652870178222656, "logps/rejected": -65.09868621826172, "loss": 0.2897, "rewards/accuracies": 1.0, "rewards/chosen": 4.9600348472595215, "rewards/margins": 0.41095781326293945, "rewards/rejected": 4.549077033996582, "step": 5162 }, { "epoch": 1.14, "learning_rate": 4.092758501790737e-06, "logits/chosen": -2.1655032634735107, "logits/rejected": -2.165461778640747, "logps/chosen": -88.21360778808594, "logps/rejected": -131.54534912109375, "loss": 0.1641, "rewards/accuracies": 1.0, "rewards/chosen": 8.545954704284668, "rewards/margins": 2.2434139251708984, "rewards/rejected": 6.3025407791137695, "step": 5163 }, { "epoch": 1.14, "learning_rate": 4.090995983816734e-06, "logits/chosen": -2.170332431793213, "logits/rejected": -2.172564744949341, "logps/chosen": -45.490386962890625, "logps/rejected": -64.54300689697266, "loss": 0.344, "rewards/accuracies": 1.0, "rewards/chosen": 4.013470649719238, "rewards/margins": 0.013916254043579102, "rewards/rejected": 3.999554395675659, "step": 5164 }, { "epoch": 1.14, "learning_rate": 4.089233582647562e-06, "logits/chosen": -1.7477906942367554, "logits/rejected": -1.7527329921722412, "logps/chosen": -15.302509307861328, "logps/rejected": -63.95616149902344, "loss": 0.5429, "rewards/accuracies": 1.0, "rewards/chosen": 3.1316230297088623, "rewards/margins": 1.6740103960037231, "rewards/rejected": 1.4576126337051392, "step": 5165 }, { "epoch": 1.14, "learning_rate": 4.0874712985096836e-06, "logits/chosen": -2.191704511642456, "logits/rejected": -2.125676155090332, "logps/chosen": -54.50541687011719, "logps/rejected": -39.69371795654297, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 4.213436126708984, "rewards/margins": 3.264662981033325, "rewards/rejected": 0.948773205280304, "step": 5166 }, { "epoch": 1.14, "learning_rate": 4.085709131629552e-06, "logits/chosen": -1.7931132316589355, "logits/rejected": -1.7401618957519531, "logps/chosen": -67.52275085449219, "logps/rejected": -42.363792419433594, "loss": 0.5242, "rewards/accuracies": 1.0, "rewards/chosen": 2.838449239730835, "rewards/margins": 0.08899784088134766, "rewards/rejected": 2.7494513988494873, "step": 5167 }, { "epoch": 1.14, "learning_rate": 4.083947082233596e-06, "logits/chosen": -1.8940869569778442, "logits/rejected": -1.8608918190002441, "logps/chosen": -28.438447952270508, "logps/rejected": -39.15653991699219, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 4.108897686004639, "rewards/margins": 0.201735258102417, "rewards/rejected": 3.9071624279022217, "step": 5168 }, { "epoch": 1.14, "learning_rate": 4.0821851505482406e-06, "logits/chosen": -1.7635084390640259, "logits/rejected": -1.6475030183792114, "logps/chosen": -59.283050537109375, "logps/rejected": -19.665782928466797, "loss": 1.0978, "rewards/accuracies": 1.0, "rewards/chosen": 9.212549209594727, "rewards/margins": 7.927942752838135, "rewards/rejected": 1.2846065759658813, "step": 5169 }, { "epoch": 1.14, "learning_rate": 4.080423336799885e-06, "logits/chosen": -1.9178279638290405, "logits/rejected": -1.9505178928375244, "logps/chosen": -69.16836547851562, "logps/rejected": -162.0563507080078, "loss": 0.6076, "rewards/accuracies": 0.0, "rewards/chosen": 8.371787071228027, "rewards/margins": -0.6419639587402344, "rewards/rejected": 9.013751029968262, "step": 5170 }, { "epoch": 1.14, "learning_rate": 4.0786616412149224e-06, "logits/chosen": -2.0055577754974365, "logits/rejected": -2.0021450519561768, "logps/chosen": -42.078826904296875, "logps/rejected": -56.245662689208984, "loss": 0.2109, "rewards/accuracies": 1.0, "rewards/chosen": 2.9183433055877686, "rewards/margins": 0.7592654228210449, "rewards/rejected": 2.1590778827667236, "step": 5171 }, { "epoch": 1.14, "learning_rate": 4.076900064019721e-06, "logits/chosen": -1.9360243082046509, "logits/rejected": -2.017751693725586, "logps/chosen": -63.75600814819336, "logps/rejected": -119.24366760253906, "loss": 3.0642, "rewards/accuracies": 0.0, "rewards/chosen": 7.0258636474609375, "rewards/margins": -6.022784233093262, "rewards/rejected": 13.0486478805542, "step": 5172 }, { "epoch": 1.14, "learning_rate": 4.075138605440645e-06, "logits/chosen": -1.5910556316375732, "logits/rejected": -1.526985764503479, "logps/chosen": -55.13774490356445, "logps/rejected": -28.939678192138672, "loss": 0.3333, "rewards/accuracies": 1.0, "rewards/chosen": 2.6835103034973145, "rewards/margins": 0.8175442218780518, "rewards/rejected": 1.8659660816192627, "step": 5173 }, { "epoch": 1.15, "learning_rate": 4.073377265704035e-06, "logits/chosen": -1.8723464012145996, "logits/rejected": -1.8723464012145996, "logps/chosen": -41.87387466430664, "logps/rejected": -41.87387466430664, "loss": 0.367, "rewards/accuracies": 0.0, "rewards/chosen": 5.067880630493164, "rewards/margins": 0.0, "rewards/rejected": 5.067880630493164, "step": 5174 }, { "epoch": 1.15, "learning_rate": 4.0716160450362155e-06, "logits/chosen": -1.8763338327407837, "logits/rejected": -1.772200107574463, "logps/chosen": -44.56678771972656, "logps/rejected": -16.708332061767578, "loss": 0.0982, "rewards/accuracies": 1.0, "rewards/chosen": 2.1586899757385254, "rewards/margins": 1.6065282821655273, "rewards/rejected": 0.5521616339683533, "step": 5175 }, { "epoch": 1.15, "learning_rate": 4.069854943663506e-06, "logits/chosen": -1.9623935222625732, "logits/rejected": -1.9252243041992188, "logps/chosen": -40.81896209716797, "logps/rejected": -33.19369888305664, "loss": 0.4635, "rewards/accuracies": 1.0, "rewards/chosen": 3.4440314769744873, "rewards/margins": 1.3340137004852295, "rewards/rejected": 2.110017776489258, "step": 5176 }, { "epoch": 1.15, "learning_rate": 4.0680939618121975e-06, "logits/chosen": -1.788421630859375, "logits/rejected": -1.8293708562850952, "logps/chosen": -35.8045768737793, "logps/rejected": -80.83663940429688, "loss": 0.6564, "rewards/accuracies": 0.0, "rewards/chosen": 3.470512866973877, "rewards/margins": -0.799504280090332, "rewards/rejected": 4.270017147064209, "step": 5177 }, { "epoch": 1.15, "learning_rate": 4.066333099708577e-06, "logits/chosen": -1.7220683097839355, "logits/rejected": -1.7130519151687622, "logps/chosen": -50.38813018798828, "logps/rejected": -51.737274169921875, "loss": 0.5957, "rewards/accuracies": 0.0, "rewards/chosen": 2.6694633960723877, "rewards/margins": -0.7994644641876221, "rewards/rejected": 3.4689278602600098, "step": 5178 }, { "epoch": 1.15, "learning_rate": 4.064572357578908e-06, "logits/chosen": -1.9259026050567627, "logits/rejected": -1.7652721405029297, "logps/chosen": -80.07231140136719, "logps/rejected": -26.579837799072266, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 3.813305616378784, "rewards/margins": 2.8827266693115234, "rewards/rejected": 0.9305790066719055, "step": 5179 }, { "epoch": 1.15, "learning_rate": 4.062811735649442e-06, "logits/chosen": -2.03043794631958, "logits/rejected": -1.9886854887008667, "logps/chosen": -95.61195373535156, "logps/rejected": -73.37126159667969, "loss": 0.0912, "rewards/accuracies": 1.0, "rewards/chosen": 5.9067888259887695, "rewards/margins": 1.8817949295043945, "rewards/rejected": 4.024993896484375, "step": 5180 }, { "epoch": 1.15, "learning_rate": 4.061051234146414e-06, "logits/chosen": -2.1160354614257812, "logits/rejected": -2.1160354614257812, "logps/chosen": -47.58934020996094, "logps/rejected": -47.58934020996094, "loss": 0.4381, "rewards/accuracies": 0.0, "rewards/chosen": 4.689568519592285, "rewards/margins": 0.0, "rewards/rejected": 4.689568519592285, "step": 5181 }, { "epoch": 1.15, "learning_rate": 4.059290853296047e-06, "logits/chosen": -1.9569607973098755, "logits/rejected": -1.9569607973098755, "logps/chosen": -46.00102996826172, "logps/rejected": -46.00102996826172, "loss": 0.721, "rewards/accuracies": 0.0, "rewards/chosen": 6.863572120666504, "rewards/margins": 0.0, "rewards/rejected": 6.863572120666504, "step": 5182 }, { "epoch": 1.15, "learning_rate": 4.057530593324542e-06, "logits/chosen": -1.635414958000183, "logits/rejected": -1.6350449323654175, "logps/chosen": -24.532438278198242, "logps/rejected": -54.816925048828125, "loss": 1.4457, "rewards/accuracies": 0.0, "rewards/chosen": 3.5145652294158936, "rewards/margins": -2.33215594291687, "rewards/rejected": 5.846721172332764, "step": 5183 }, { "epoch": 1.15, "learning_rate": 4.055770454458092e-06, "logits/chosen": -1.8031266927719116, "logits/rejected": -1.699708342552185, "logps/chosen": -40.34911346435547, "logps/rejected": -60.099552154541016, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 4.082578182220459, "rewards/margins": 3.3320696353912354, "rewards/rejected": 0.7505084872245789, "step": 5184 }, { "epoch": 1.15, "learning_rate": 4.054010436922869e-06, "logits/chosen": -1.874605417251587, "logits/rejected": -1.7960102558135986, "logps/chosen": -50.727142333984375, "logps/rejected": -54.57684326171875, "loss": 0.1575, "rewards/accuracies": 1.0, "rewards/chosen": 3.5713250637054443, "rewards/margins": 1.540132999420166, "rewards/rejected": 2.0311920642852783, "step": 5185 }, { "epoch": 1.15, "learning_rate": 4.052250540945029e-06, "logits/chosen": -2.175032377243042, "logits/rejected": -2.169687032699585, "logps/chosen": -74.28738403320312, "logps/rejected": -96.95313262939453, "loss": 0.4906, "rewards/accuracies": 0.0, "rewards/chosen": 8.426886558532715, "rewards/margins": -0.45044422149658203, "rewards/rejected": 8.877330780029297, "step": 5186 }, { "epoch": 1.15, "learning_rate": 4.050490766750718e-06, "logits/chosen": -2.038644313812256, "logits/rejected": -1.9495686292648315, "logps/chosen": -57.33425521850586, "logps/rejected": -11.25611400604248, "loss": 0.7367, "rewards/accuracies": 1.0, "rewards/chosen": 4.7481818199157715, "rewards/margins": 3.8762478828430176, "rewards/rejected": 0.8719340562820435, "step": 5187 }, { "epoch": 1.15, "learning_rate": 4.048731114566059e-06, "logits/chosen": -1.9900022745132446, "logits/rejected": -1.9712187051773071, "logps/chosen": -30.802955627441406, "logps/rejected": -52.7889404296875, "loss": 1.0441, "rewards/accuracies": 1.0, "rewards/chosen": 3.413933515548706, "rewards/margins": 0.32584452629089355, "rewards/rejected": 3.0880889892578125, "step": 5188 }, { "epoch": 1.15, "learning_rate": 4.0469715846171675e-06, "logits/chosen": -2.0652780532836914, "logits/rejected": -2.001574754714966, "logps/chosen": -73.95947265625, "logps/rejected": -75.25465393066406, "loss": 0.1949, "rewards/accuracies": 1.0, "rewards/chosen": 4.085538387298584, "rewards/margins": 0.793027400970459, "rewards/rejected": 3.292510986328125, "step": 5189 }, { "epoch": 1.15, "learning_rate": 4.045212177130133e-06, "logits/chosen": -1.9648257493972778, "logits/rejected": -1.9745419025421143, "logps/chosen": -86.26325988769531, "logps/rejected": -120.45028686523438, "loss": 0.1126, "rewards/accuracies": 1.0, "rewards/chosen": 10.401564598083496, "rewards/margins": 1.380401611328125, "rewards/rejected": 9.021162986755371, "step": 5190 }, { "epoch": 1.15, "learning_rate": 4.043452892331041e-06, "logits/chosen": -1.8043127059936523, "logits/rejected": -1.8120182752609253, "logps/chosen": -32.930179595947266, "logps/rejected": -86.93455505371094, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 3.2878079414367676, "rewards/margins": 0.03298544883728027, "rewards/rejected": 3.2548224925994873, "step": 5191 }, { "epoch": 1.15, "learning_rate": 4.0416937304459515e-06, "logits/chosen": -2.073723554611206, "logits/rejected": -2.0965802669525146, "logps/chosen": -46.825592041015625, "logps/rejected": -117.79133605957031, "loss": 0.5218, "rewards/accuracies": 1.0, "rewards/chosen": 4.888066291809082, "rewards/margins": 1.7901062965393066, "rewards/rejected": 3.0979599952697754, "step": 5192 }, { "epoch": 1.15, "learning_rate": 4.039934691700915e-06, "logits/chosen": -1.983289361000061, "logits/rejected": -1.9265888929367065, "logps/chosen": -46.51679992675781, "logps/rejected": -70.47648620605469, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 2.568469285964966, "rewards/margins": 2.1669769287109375, "rewards/rejected": 0.40149232745170593, "step": 5193 }, { "epoch": 1.15, "learning_rate": 4.038175776321962e-06, "logits/chosen": -2.030346632003784, "logits/rejected": -2.0192573070526123, "logps/chosen": -55.64356994628906, "logps/rejected": -58.68705749511719, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": 3.4416840076446533, "rewards/margins": 0.7887420654296875, "rewards/rejected": 2.652941942214966, "step": 5194 }, { "epoch": 1.15, "learning_rate": 4.0364169845351095e-06, "logits/chosen": -1.59506356716156, "logits/rejected": -1.564420223236084, "logps/chosen": -30.9570255279541, "logps/rejected": -38.522762298583984, "loss": 0.5512, "rewards/accuracies": 1.0, "rewards/chosen": 2.564216375350952, "rewards/margins": 1.530038595199585, "rewards/rejected": 1.0341777801513672, "step": 5195 }, { "epoch": 1.15, "learning_rate": 4.03465831656636e-06, "logits/chosen": -1.7091763019561768, "logits/rejected": -1.4392216205596924, "logps/chosen": -64.45277404785156, "logps/rejected": -49.59549331665039, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 3.3937737941741943, "rewards/margins": 1.9769986867904663, "rewards/rejected": 1.416775107383728, "step": 5196 }, { "epoch": 1.15, "learning_rate": 4.032899772641694e-06, "logits/chosen": -1.9143767356872559, "logits/rejected": -1.923559546470642, "logps/chosen": -56.00668716430664, "logps/rejected": -124.50326538085938, "loss": 0.1505, "rewards/accuracies": 1.0, "rewards/chosen": 7.877172946929932, "rewards/margins": 1.3350005149841309, "rewards/rejected": 6.542172431945801, "step": 5197 }, { "epoch": 1.15, "learning_rate": 4.031141352987084e-06, "logits/chosen": -1.9080451726913452, "logits/rejected": -1.803697943687439, "logps/chosen": -61.099483489990234, "logps/rejected": -16.186534881591797, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 1.7253605127334595, "rewards/margins": 1.1578764915466309, "rewards/rejected": 0.5674840807914734, "step": 5198 }, { "epoch": 1.15, "learning_rate": 4.0293830578284784e-06, "logits/chosen": -1.9975398778915405, "logits/rejected": -1.8370708227157593, "logps/chosen": -161.75323486328125, "logps/rejected": -60.322593688964844, "loss": 0.2952, "rewards/accuracies": 1.0, "rewards/chosen": 6.07893705368042, "rewards/margins": 3.0934152603149414, "rewards/rejected": 2.9855217933654785, "step": 5199 }, { "epoch": 1.15, "learning_rate": 4.02762488739182e-06, "logits/chosen": -1.6790729761123657, "logits/rejected": -1.4244743585586548, "logps/chosen": -12.325338363647461, "logps/rejected": -58.826019287109375, "loss": 2.0832, "rewards/accuracies": 0.0, "rewards/chosen": 0.8552915453910828, "rewards/margins": -4.137564659118652, "rewards/rejected": 4.992856025695801, "step": 5200 }, { "epoch": 1.15, "learning_rate": 4.025866841903023e-06, "logits/chosen": -1.981764793395996, "logits/rejected": -1.9508386850357056, "logps/chosen": -38.54934310913086, "logps/rejected": -47.4376106262207, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 4.5224738121032715, "rewards/margins": 2.3491244316101074, "rewards/rejected": 2.173349380493164, "step": 5201 }, { "epoch": 1.15, "learning_rate": 4.0241089215879965e-06, "logits/chosen": -2.141787052154541, "logits/rejected": -2.1194214820861816, "logps/chosen": -52.363773345947266, "logps/rejected": -56.07544708251953, "loss": 0.2571, "rewards/accuracies": 1.0, "rewards/chosen": 3.897716999053955, "rewards/margins": 1.2420399188995361, "rewards/rejected": 2.655677080154419, "step": 5202 }, { "epoch": 1.15, "learning_rate": 4.022351126672628e-06, "logits/chosen": -1.7440388202667236, "logits/rejected": -1.8667510747909546, "logps/chosen": -26.23790168762207, "logps/rejected": -106.8030014038086, "loss": 2.9568, "rewards/accuracies": 0.0, "rewards/chosen": 3.269456624984741, "rewards/margins": -5.8752641677856445, "rewards/rejected": 9.144721031188965, "step": 5203 }, { "epoch": 1.15, "learning_rate": 4.02059345738279e-06, "logits/chosen": -1.7567181587219238, "logits/rejected": -1.6465171575546265, "logps/chosen": -78.4873275756836, "logps/rejected": -152.43344116210938, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": 9.294429779052734, "rewards/margins": 2.2446541786193848, "rewards/rejected": 7.04977560043335, "step": 5204 }, { "epoch": 1.15, "learning_rate": 4.018835913944336e-06, "logits/chosen": -1.8848999738693237, "logits/rejected": -1.7218332290649414, "logps/chosen": -39.86976623535156, "logps/rejected": -44.660980224609375, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 4.03418493270874, "rewards/margins": 2.1725716590881348, "rewards/rejected": 1.861613154411316, "step": 5205 }, { "epoch": 1.15, "learning_rate": 4.01707849658311e-06, "logits/chosen": -1.7580993175506592, "logits/rejected": -1.7580993175506592, "logps/chosen": -60.71821975708008, "logps/rejected": -60.71821975708008, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 5.361769676208496, "rewards/margins": 0.0, "rewards/rejected": 5.361769676208496, "step": 5206 }, { "epoch": 1.15, "learning_rate": 4.015321205524935e-06, "logits/chosen": -1.6328319311141968, "logits/rejected": -1.5688848495483398, "logps/chosen": -34.89689254760742, "logps/rejected": -48.0881233215332, "loss": 0.2337, "rewards/accuracies": 1.0, "rewards/chosen": 4.065150737762451, "rewards/margins": 0.5184004306793213, "rewards/rejected": 3.54675030708313, "step": 5207 }, { "epoch": 1.15, "learning_rate": 4.013564040995615e-06, "logits/chosen": -1.8848766088485718, "logits/rejected": -1.90324068069458, "logps/chosen": -59.52964782714844, "logps/rejected": -46.650962829589844, "loss": 0.4486, "rewards/accuracies": 0.0, "rewards/chosen": 3.1606216430664062, "rewards/margins": -0.03220367431640625, "rewards/rejected": 3.1928253173828125, "step": 5208 }, { "epoch": 1.15, "learning_rate": 4.011807003220948e-06, "logits/chosen": -1.8052711486816406, "logits/rejected": -1.3855626583099365, "logps/chosen": -42.53300476074219, "logps/rejected": -54.184478759765625, "loss": 0.6486, "rewards/accuracies": 0.0, "rewards/chosen": 3.8901422023773193, "rewards/margins": -0.31676459312438965, "rewards/rejected": 4.206906795501709, "step": 5209 }, { "epoch": 1.15, "learning_rate": 4.010050092426703e-06, "logits/chosen": -1.8728151321411133, "logits/rejected": -1.751266360282898, "logps/chosen": -200.67324829101562, "logps/rejected": -48.735206604003906, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 7.673404216766357, "rewards/margins": 2.3905696868896484, "rewards/rejected": 5.282834529876709, "step": 5210 }, { "epoch": 1.15, "learning_rate": 4.008293308838643e-06, "logits/chosen": -2.0432817935943604, "logits/rejected": -2.046757459640503, "logps/chosen": -22.628408432006836, "logps/rejected": -14.977483749389648, "loss": 0.4497, "rewards/accuracies": 1.0, "rewards/chosen": 1.3240693807601929, "rewards/margins": 0.9476662874221802, "rewards/rejected": 0.3764030635356903, "step": 5211 }, { "epoch": 1.15, "learning_rate": 4.006536652682508e-06, "logits/chosen": -2.144482135772705, "logits/rejected": -2.1090571880340576, "logps/chosen": -86.30647277832031, "logps/rejected": -69.78706359863281, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 6.168339729309082, "rewards/margins": 1.471388339996338, "rewards/rejected": 4.696951389312744, "step": 5212 }, { "epoch": 1.15, "learning_rate": 4.004780124184026e-06, "logits/chosen": -1.939439296722412, "logits/rejected": -1.939439296722412, "logps/chosen": -57.90837097167969, "logps/rejected": -57.90837097167969, "loss": 0.4573, "rewards/accuracies": 0.0, "rewards/chosen": 9.001686096191406, "rewards/margins": 0.0, "rewards/rejected": 9.001686096191406, "step": 5213 }, { "epoch": 1.15, "learning_rate": 4.003023723568903e-06, "logits/chosen": -1.6818692684173584, "logits/rejected": -1.6836795806884766, "logps/chosen": -35.16141891479492, "logps/rejected": -38.84165573120117, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 2.675976276397705, "rewards/margins": 1.167012095451355, "rewards/rejected": 1.50896418094635, "step": 5214 }, { "epoch": 1.15, "learning_rate": 4.001267451062837e-06, "logits/chosen": -1.7619951963424683, "logits/rejected": -1.7748525142669678, "logps/chosen": -34.515342712402344, "logps/rejected": -47.450408935546875, "loss": 0.3598, "rewards/accuracies": 0.0, "rewards/chosen": 3.247913360595703, "rewards/margins": -0.045371294021606445, "rewards/rejected": 3.2932846546173096, "step": 5215 }, { "epoch": 1.15, "learning_rate": 3.999511306891503e-06, "logits/chosen": -1.9413093328475952, "logits/rejected": -1.8738337755203247, "logps/chosen": -71.24463653564453, "logps/rejected": -53.89588928222656, "loss": 0.3003, "rewards/accuracies": 1.0, "rewards/chosen": 5.147732734680176, "rewards/margins": 1.8609819412231445, "rewards/rejected": 3.2867507934570312, "step": 5216 }, { "epoch": 1.15, "learning_rate": 3.997755291280559e-06, "logits/chosen": -1.7696259021759033, "logits/rejected": -1.7535357475280762, "logps/chosen": -57.08626937866211, "logps/rejected": -34.06521224975586, "loss": 0.4279, "rewards/accuracies": 0.0, "rewards/chosen": 3.711874008178711, "rewards/margins": -0.26118922233581543, "rewards/rejected": 3.9730632305145264, "step": 5217 }, { "epoch": 1.15, "learning_rate": 3.995999404455652e-06, "logits/chosen": -2.156777858734131, "logits/rejected": -2.1695401668548584, "logps/chosen": -43.194419860839844, "logps/rejected": -52.208580017089844, "loss": 0.7098, "rewards/accuracies": 0.0, "rewards/chosen": 3.9600167274475098, "rewards/margins": -1.1390275955200195, "rewards/rejected": 5.099044322967529, "step": 5218 }, { "epoch": 1.16, "learning_rate": 3.9942436466424075e-06, "logits/chosen": -2.1507880687713623, "logits/rejected": -2.174283266067505, "logps/chosen": -56.887939453125, "logps/rejected": -78.38668823242188, "loss": 0.4163, "rewards/accuracies": 1.0, "rewards/chosen": 4.382081508636475, "rewards/margins": 0.1369786262512207, "rewards/rejected": 4.245102882385254, "step": 5219 }, { "epoch": 1.16, "learning_rate": 3.992488018066438e-06, "logits/chosen": -1.937394142150879, "logits/rejected": -1.8240493535995483, "logps/chosen": -64.56828308105469, "logps/rejected": -69.15687561035156, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 3.488964796066284, "rewards/margins": 4.694468975067139, "rewards/rejected": -1.205504298210144, "step": 5220 }, { "epoch": 1.16, "learning_rate": 3.990732518953335e-06, "logits/chosen": -1.8130362033843994, "logits/rejected": -1.8130362033843994, "logps/chosen": -23.12020492553711, "logps/rejected": -23.12020492553711, "loss": 0.4579, "rewards/accuracies": 0.0, "rewards/chosen": 3.4202427864074707, "rewards/margins": 0.0, "rewards/rejected": 3.4202427864074707, "step": 5221 }, { "epoch": 1.16, "learning_rate": 3.988977149528678e-06, "logits/chosen": -2.1249196529388428, "logits/rejected": -2.041543483734131, "logps/chosen": -114.56072998046875, "logps/rejected": -43.657222747802734, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 6.844729900360107, "rewards/margins": 4.7577924728393555, "rewards/rejected": 2.086937427520752, "step": 5222 }, { "epoch": 1.16, "learning_rate": 3.987221910018025e-06, "logits/chosen": -1.8816537857055664, "logits/rejected": -1.8816537857055664, "logps/chosen": -46.11048126220703, "logps/rejected": -46.11048126220703, "loss": 0.403, "rewards/accuracies": 0.0, "rewards/chosen": 5.130675792694092, "rewards/margins": 0.0, "rewards/rejected": 5.130675792694092, "step": 5223 }, { "epoch": 1.16, "learning_rate": 3.985466800646923e-06, "logits/chosen": -1.9487836360931396, "logits/rejected": -1.9731675386428833, "logps/chosen": -47.46876525878906, "logps/rejected": -40.74974822998047, "loss": 0.4177, "rewards/accuracies": 0.0, "rewards/chosen": 3.1231095790863037, "rewards/margins": -0.1792609691619873, "rewards/rejected": 3.302370548248291, "step": 5224 }, { "epoch": 1.16, "learning_rate": 3.983711821640899e-06, "logits/chosen": -1.9640074968338013, "logits/rejected": -1.8486272096633911, "logps/chosen": -70.3241195678711, "logps/rejected": -41.09283447265625, "loss": 0.1493, "rewards/accuracies": 1.0, "rewards/chosen": 7.324556827545166, "rewards/margins": 1.0583930015563965, "rewards/rejected": 6.2661638259887695, "step": 5225 }, { "epoch": 1.16, "learning_rate": 3.9819569732254605e-06, "logits/chosen": -2.1130247116088867, "logits/rejected": -2.0883986949920654, "logps/chosen": -70.77162170410156, "logps/rejected": -62.75938415527344, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": 6.3622727394104, "rewards/margins": 2.5606300830841064, "rewards/rejected": 3.801642656326294, "step": 5226 }, { "epoch": 1.16, "learning_rate": 3.980202255626106e-06, "logits/chosen": -1.9545221328735352, "logits/rejected": -1.81606924533844, "logps/chosen": -91.26031494140625, "logps/rejected": -43.60570526123047, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 7.32217264175415, "rewards/margins": 4.5276665687561035, "rewards/rejected": 2.794506072998047, "step": 5227 }, { "epoch": 1.16, "learning_rate": 3.978447669068309e-06, "logits/chosen": -1.9155038595199585, "logits/rejected": -1.7887136936187744, "logps/chosen": -47.631595611572266, "logps/rejected": -57.12807083129883, "loss": 0.1881, "rewards/accuracies": 1.0, "rewards/chosen": 4.231271266937256, "rewards/margins": 0.8419268131256104, "rewards/rejected": 3.3893444538116455, "step": 5228 }, { "epoch": 1.16, "learning_rate": 3.976693213777532e-06, "logits/chosen": -2.0318336486816406, "logits/rejected": -1.8598719835281372, "logps/chosen": -134.6199493408203, "logps/rejected": -32.28414535522461, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 8.358065605163574, "rewards/margins": 7.835810661315918, "rewards/rejected": 0.5222549438476562, "step": 5229 }, { "epoch": 1.16, "learning_rate": 3.974938889979216e-06, "logits/chosen": -2.006570339202881, "logits/rejected": -1.9227393865585327, "logps/chosen": -76.9854736328125, "logps/rejected": -32.43632125854492, "loss": 0.2542, "rewards/accuracies": 1.0, "rewards/chosen": 3.5182595252990723, "rewards/margins": 1.8547260761260986, "rewards/rejected": 1.6635334491729736, "step": 5230 }, { "epoch": 1.16, "learning_rate": 3.973184697898789e-06, "logits/chosen": -1.841423749923706, "logits/rejected": -1.841423749923706, "logps/chosen": -13.129373550415039, "logps/rejected": -13.129373550415039, "loss": 0.409, "rewards/accuracies": 0.0, "rewards/chosen": 1.7242611646652222, "rewards/margins": 0.0, "rewards/rejected": 1.7242611646652222, "step": 5231 }, { "epoch": 1.16, "learning_rate": 3.971430637761659e-06, "logits/chosen": -1.9651328325271606, "logits/rejected": -1.9897379875183105, "logps/chosen": -39.64231872558594, "logps/rejected": -70.76025390625, "loss": 1.7516, "rewards/accuracies": 0.0, "rewards/chosen": 3.680468797683716, "rewards/margins": -3.309509515762329, "rewards/rejected": 6.989978313446045, "step": 5232 }, { "epoch": 1.16, "learning_rate": 3.9696767097932205e-06, "logits/chosen": -1.8507418632507324, "logits/rejected": -1.8590478897094727, "logps/chosen": -65.01831817626953, "logps/rejected": -97.48512268066406, "loss": 0.1238, "rewards/accuracies": 1.0, "rewards/chosen": 3.5452020168304443, "rewards/margins": 1.581649899482727, "rewards/rejected": 1.9635521173477173, "step": 5233 }, { "epoch": 1.16, "learning_rate": 3.967922914218846e-06, "logits/chosen": -1.7128105163574219, "logits/rejected": -1.6747567653656006, "logps/chosen": -43.61846923828125, "logps/rejected": -53.57814407348633, "loss": 0.1822, "rewards/accuracies": 1.0, "rewards/chosen": 3.2242355346679688, "rewards/margins": 1.1603686809539795, "rewards/rejected": 2.0638668537139893, "step": 5234 }, { "epoch": 1.16, "learning_rate": 3.966169251263898e-06, "logits/chosen": -2.305974245071411, "logits/rejected": -2.300074815750122, "logps/chosen": -31.15985870361328, "logps/rejected": -76.42457580566406, "loss": 1.8636, "rewards/accuracies": 0.0, "rewards/chosen": 5.799762725830078, "rewards/margins": -3.563861846923828, "rewards/rejected": 9.363624572753906, "step": 5235 }, { "epoch": 1.16, "learning_rate": 3.9644157211537164e-06, "logits/chosen": -1.9983247518539429, "logits/rejected": -1.9444544315338135, "logps/chosen": -112.74562072753906, "logps/rejected": -31.335899353027344, "loss": 0.0764, "rewards/accuracies": 1.0, "rewards/chosen": 6.264279365539551, "rewards/margins": 3.307058095932007, "rewards/rejected": 2.957221269607544, "step": 5236 }, { "epoch": 1.16, "learning_rate": 3.962662324113623e-06, "logits/chosen": -2.179429054260254, "logits/rejected": -2.166450262069702, "logps/chosen": -75.9837417602539, "logps/rejected": -44.008365631103516, "loss": 0.3731, "rewards/accuracies": 1.0, "rewards/chosen": 3.7550408840179443, "rewards/margins": 1.0387799739837646, "rewards/rejected": 2.7162609100341797, "step": 5237 }, { "epoch": 1.16, "learning_rate": 3.960909060368929e-06, "logits/chosen": -1.8822005987167358, "logits/rejected": -1.8822005987167358, "logps/chosen": -42.666534423828125, "logps/rejected": -42.666534423828125, "loss": 0.3824, "rewards/accuracies": 0.0, "rewards/chosen": 2.2705705165863037, "rewards/margins": 0.0, "rewards/rejected": 2.2705705165863037, "step": 5238 }, { "epoch": 1.16, "learning_rate": 3.959155930144922e-06, "logits/chosen": -1.7471944093704224, "logits/rejected": -1.6885321140289307, "logps/chosen": -73.69186401367188, "logps/rejected": -66.66520690917969, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 6.368419170379639, "rewards/margins": 3.9568209648132324, "rewards/rejected": 2.4115982055664062, "step": 5239 }, { "epoch": 1.16, "learning_rate": 3.9574029336668775e-06, "logits/chosen": -1.9222586154937744, "logits/rejected": -1.9121516942977905, "logps/chosen": -41.3173942565918, "logps/rejected": -86.05905151367188, "loss": 0.183, "rewards/accuracies": 1.0, "rewards/chosen": 4.654550552368164, "rewards/margins": 1.2259409427642822, "rewards/rejected": 3.428609609603882, "step": 5240 }, { "epoch": 1.16, "learning_rate": 3.955650071160047e-06, "logits/chosen": -1.9247499704360962, "logits/rejected": -1.9242916107177734, "logps/chosen": -30.638587951660156, "logps/rejected": -53.86574935913086, "loss": 0.5889, "rewards/accuracies": 0.0, "rewards/chosen": 2.65765643119812, "rewards/margins": -0.3508751392364502, "rewards/rejected": 3.0085315704345703, "step": 5241 }, { "epoch": 1.16, "learning_rate": 3.953897342849673e-06, "logits/chosen": -1.8384795188903809, "logits/rejected": -1.8720076084136963, "logps/chosen": -40.942726135253906, "logps/rejected": -86.91414642333984, "loss": 0.3648, "rewards/accuracies": 1.0, "rewards/chosen": 3.2094292640686035, "rewards/margins": 1.8707748651504517, "rewards/rejected": 1.3386543989181519, "step": 5242 }, { "epoch": 1.16, "learning_rate": 3.952144748960976e-06, "logits/chosen": -2.141430139541626, "logits/rejected": -2.0942444801330566, "logps/chosen": -62.293060302734375, "logps/rejected": -47.27189254760742, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": 4.755070686340332, "rewards/margins": 0.004081249237060547, "rewards/rejected": 4.7509894371032715, "step": 5243 }, { "epoch": 1.16, "learning_rate": 3.9503922897191596e-06, "logits/chosen": -1.9514763355255127, "logits/rejected": -1.8553448915481567, "logps/chosen": -34.891910552978516, "logps/rejected": -16.885875701904297, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 4.154541492462158, "rewards/margins": 1.0203568935394287, "rewards/rejected": 3.1341845989227295, "step": 5244 }, { "epoch": 1.16, "learning_rate": 3.948639965349411e-06, "logits/chosen": -1.8319283723831177, "logits/rejected": -1.8224445581436157, "logps/chosen": -53.264888763427734, "logps/rejected": -17.237485885620117, "loss": 0.4922, "rewards/accuracies": 0.0, "rewards/chosen": 4.131920337677002, "rewards/margins": -0.4519534111022949, "rewards/rejected": 4.583873748779297, "step": 5245 }, { "epoch": 1.16, "learning_rate": 3.9468877760768996e-06, "logits/chosen": -2.0807437896728516, "logits/rejected": -2.0855162143707275, "logps/chosen": -90.20036315917969, "logps/rejected": -68.10382080078125, "loss": 0.1913, "rewards/accuracies": 1.0, "rewards/chosen": 6.600965976715088, "rewards/margins": 1.9463319778442383, "rewards/rejected": 4.65463399887085, "step": 5246 }, { "epoch": 1.16, "learning_rate": 3.945135722126777e-06, "logits/chosen": -2.025740146636963, "logits/rejected": -2.0851194858551025, "logps/chosen": -43.04474639892578, "logps/rejected": -87.68159484863281, "loss": 1.7047, "rewards/accuracies": 0.0, "rewards/chosen": 4.190032482147217, "rewards/margins": -3.2277259826660156, "rewards/rejected": 7.417758464813232, "step": 5247 }, { "epoch": 1.16, "learning_rate": 3.943383803724178e-06, "logits/chosen": -1.7811466455459595, "logits/rejected": -1.7536509037017822, "logps/chosen": -55.779476165771484, "logps/rejected": -46.71958541870117, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/chosen": 3.825043201446533, "rewards/margins": 0.5698974132537842, "rewards/rejected": 3.255145788192749, "step": 5248 }, { "epoch": 1.16, "learning_rate": 3.941632021094221e-06, "logits/chosen": -2.113302707672119, "logits/rejected": -2.106311798095703, "logps/chosen": -47.52315139770508, "logps/rejected": -38.26431655883789, "loss": 1.4441, "rewards/accuracies": 0.0, "rewards/chosen": 3.20825457572937, "rewards/margins": -0.6676042079925537, "rewards/rejected": 3.875858783721924, "step": 5249 }, { "epoch": 1.16, "learning_rate": 3.939880374462003e-06, "logits/chosen": -2.0884463787078857, "logits/rejected": -2.0667781829833984, "logps/chosen": -38.36054992675781, "logps/rejected": -37.599700927734375, "loss": 0.253, "rewards/accuracies": 1.0, "rewards/chosen": 4.0017547607421875, "rewards/margins": 0.9289963245391846, "rewards/rejected": 3.072758436203003, "step": 5250 }, { "epoch": 1.16, "learning_rate": 3.938128864052611e-06, "logits/chosen": -2.069192409515381, "logits/rejected": -2.1000208854675293, "logps/chosen": -38.42982864379883, "logps/rejected": -83.47261810302734, "loss": 1.2993, "rewards/accuracies": 0.0, "rewards/chosen": 2.7698781490325928, "rewards/margins": -2.397505521774292, "rewards/rejected": 5.167383670806885, "step": 5251 }, { "epoch": 1.16, "learning_rate": 3.936377490091105e-06, "logits/chosen": -2.003079891204834, "logits/rejected": -2.003079891204834, "logps/chosen": -23.076194763183594, "logps/rejected": -23.076194763183594, "loss": 1.3443, "rewards/accuracies": 0.0, "rewards/chosen": 5.599312782287598, "rewards/margins": 0.0, "rewards/rejected": 5.599312782287598, "step": 5252 }, { "epoch": 1.16, "learning_rate": 3.934626252802536e-06, "logits/chosen": -1.9419801235198975, "logits/rejected": -1.9430211782455444, "logps/chosen": -46.94392013549805, "logps/rejected": -60.768798828125, "loss": 0.1692, "rewards/accuracies": 1.0, "rewards/chosen": 3.2594821453094482, "rewards/margins": 1.3966548442840576, "rewards/rejected": 1.8628273010253906, "step": 5253 }, { "epoch": 1.16, "learning_rate": 3.932875152411932e-06, "logits/chosen": -2.2595837116241455, "logits/rejected": -2.2447757720947266, "logps/chosen": -117.67489624023438, "logps/rejected": -80.44961547851562, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 12.050811767578125, "rewards/margins": 2.2120819091796875, "rewards/rejected": 9.838729858398438, "step": 5254 }, { "epoch": 1.16, "learning_rate": 3.931124189144307e-06, "logits/chosen": -1.9897562265396118, "logits/rejected": -1.9708707332611084, "logps/chosen": -107.8879623413086, "logps/rejected": -97.60630798339844, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 9.035090446472168, "rewards/margins": 1.6619625091552734, "rewards/rejected": 7.3731279373168945, "step": 5255 }, { "epoch": 1.16, "learning_rate": 3.929373363224654e-06, "logits/chosen": -1.7797372341156006, "logits/rejected": -1.7778501510620117, "logps/chosen": -34.24763488769531, "logps/rejected": -40.45206832885742, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 2.7952053546905518, "rewards/margins": 0.061916351318359375, "rewards/rejected": 2.7332890033721924, "step": 5256 }, { "epoch": 1.16, "learning_rate": 3.927622674877948e-06, "logits/chosen": -1.9144625663757324, "logits/rejected": -1.987261176109314, "logps/chosen": -44.23956298828125, "logps/rejected": -79.72427368164062, "loss": 0.7374, "rewards/accuracies": 0.0, "rewards/chosen": 4.549917697906494, "rewards/margins": -1.118682861328125, "rewards/rejected": 5.668600559234619, "step": 5257 }, { "epoch": 1.16, "learning_rate": 3.925872124329153e-06, "logits/chosen": -2.0324933528900146, "logits/rejected": -1.9757369756698608, "logps/chosen": -144.83285522460938, "logps/rejected": -103.17091369628906, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/chosen": 8.607751846313477, "rewards/margins": 1.5766072273254395, "rewards/rejected": 7.031144618988037, "step": 5258 }, { "epoch": 1.16, "learning_rate": 3.924121711803205e-06, "logits/chosen": -2.078291416168213, "logits/rejected": -1.7630383968353271, "logps/chosen": -37.161102294921875, "logps/rejected": -61.171142578125, "loss": 0.9704, "rewards/accuracies": 0.0, "rewards/chosen": 3.7666702270507812, "rewards/margins": -1.7094149589538574, "rewards/rejected": 5.476085186004639, "step": 5259 }, { "epoch": 1.16, "learning_rate": 3.922371437525033e-06, "logits/chosen": -2.12589168548584, "logits/rejected": -2.12589168548584, "logps/chosen": -65.08830261230469, "logps/rejected": -65.08830261230469, "loss": 0.4309, "rewards/accuracies": 0.0, "rewards/chosen": 5.557273864746094, "rewards/margins": 0.0, "rewards/rejected": 5.557273864746094, "step": 5260 }, { "epoch": 1.16, "learning_rate": 3.920621301719538e-06, "logits/chosen": -2.0041821002960205, "logits/rejected": -1.9323408603668213, "logps/chosen": -41.33583068847656, "logps/rejected": -16.624221801757812, "loss": 0.3643, "rewards/accuracies": 1.0, "rewards/chosen": 1.552744746208191, "rewards/margins": 0.39421892166137695, "rewards/rejected": 1.158525824546814, "step": 5261 }, { "epoch": 1.16, "learning_rate": 3.918871304611614e-06, "logits/chosen": -2.1606810092926025, "logits/rejected": -2.184023857116699, "logps/chosen": -77.86726379394531, "logps/rejected": -167.0441131591797, "loss": 0.1934, "rewards/accuracies": 1.0, "rewards/chosen": 11.335803031921387, "rewards/margins": 0.7583723068237305, "rewards/rejected": 10.577430725097656, "step": 5262 }, { "epoch": 1.16, "learning_rate": 3.917121446426127e-06, "logits/chosen": -2.0986671447753906, "logits/rejected": -2.048003911972046, "logps/chosen": -52.87842559814453, "logps/rejected": -50.869102478027344, "loss": 0.2519, "rewards/accuracies": 1.0, "rewards/chosen": 4.261547088623047, "rewards/margins": 1.6394591331481934, "rewards/rejected": 2.6220879554748535, "step": 5263 }, { "epoch": 1.17, "learning_rate": 3.915371727387932e-06, "logits/chosen": -1.808341145515442, "logits/rejected": -1.6130141019821167, "logps/chosen": -101.38204193115234, "logps/rejected": -24.254478454589844, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 8.159940719604492, "rewards/margins": 7.21109676361084, "rewards/rejected": 0.9488441348075867, "step": 5264 }, { "epoch": 1.17, "learning_rate": 3.91362214772186e-06, "logits/chosen": -2.0309293270111084, "logits/rejected": -2.029771089553833, "logps/chosen": -81.45293426513672, "logps/rejected": -91.82235717773438, "loss": 0.0824, "rewards/accuracies": 1.0, "rewards/chosen": 7.308822154998779, "rewards/margins": 1.932572364807129, "rewards/rejected": 5.37624979019165, "step": 5265 }, { "epoch": 1.17, "learning_rate": 3.911872707652734e-06, "logits/chosen": -1.729372262954712, "logits/rejected": -1.690279245376587, "logps/chosen": -62.275184631347656, "logps/rejected": -66.92514038085938, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": 3.7989137172698975, "rewards/margins": 2.0010757446289062, "rewards/rejected": 1.7978378534317017, "step": 5266 }, { "epoch": 1.17, "learning_rate": 3.910123407405348e-06, "logits/chosen": -2.104414701461792, "logits/rejected": -2.0713818073272705, "logps/chosen": -56.9163703918457, "logps/rejected": -51.33155822753906, "loss": 0.1996, "rewards/accuracies": 1.0, "rewards/chosen": 4.58845853805542, "rewards/margins": 0.8357729911804199, "rewards/rejected": 3.752685546875, "step": 5267 }, { "epoch": 1.17, "learning_rate": 3.908374247204483e-06, "logits/chosen": -2.0537917613983154, "logits/rejected": -1.9225541353225708, "logps/chosen": -149.25424194335938, "logps/rejected": -20.37273406982422, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 6.146777629852295, "rewards/margins": 6.056589603424072, "rewards/rejected": 0.09018802642822266, "step": 5268 }, { "epoch": 1.17, "learning_rate": 3.906625227274905e-06, "logits/chosen": -2.0977752208709717, "logits/rejected": -2.076423168182373, "logps/chosen": -113.2696762084961, "logps/rejected": -156.38674926757812, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 10.26465892791748, "rewards/margins": 2.4775633811950684, "rewards/rejected": 7.787095546722412, "step": 5269 }, { "epoch": 1.17, "learning_rate": 3.904876347841354e-06, "logits/chosen": -1.9167871475219727, "logits/rejected": -1.9686683416366577, "logps/chosen": -18.45602035522461, "logps/rejected": -66.10981750488281, "loss": 0.621, "rewards/accuracies": 0.0, "rewards/chosen": 2.8406999111175537, "rewards/margins": -0.860870361328125, "rewards/rejected": 3.7015702724456787, "step": 5270 }, { "epoch": 1.17, "learning_rate": 3.903127609128562e-06, "logits/chosen": -1.6486338376998901, "logits/rejected": -1.628516674041748, "logps/chosen": -56.506771087646484, "logps/rejected": -42.369529724121094, "loss": 0.3932, "rewards/accuracies": 1.0, "rewards/chosen": 3.6398518085479736, "rewards/margins": 0.6695828437805176, "rewards/rejected": 2.970268964767456, "step": 5271 }, { "epoch": 1.17, "learning_rate": 3.901379011361234e-06, "logits/chosen": -1.7104034423828125, "logits/rejected": -1.684976577758789, "logps/chosen": -43.81204605102539, "logps/rejected": -62.746856689453125, "loss": 0.3835, "rewards/accuracies": 1.0, "rewards/chosen": 2.3964803218841553, "rewards/margins": 1.109655499458313, "rewards/rejected": 1.2868248224258423, "step": 5272 }, { "epoch": 1.17, "learning_rate": 3.899630554764064e-06, "logits/chosen": -1.6068980693817139, "logits/rejected": -1.5927401781082153, "logps/chosen": -31.777631759643555, "logps/rejected": -40.451507568359375, "loss": 0.8308, "rewards/accuracies": 0.0, "rewards/chosen": 3.0300333499908447, "rewards/margins": -1.2569324970245361, "rewards/rejected": 4.286965847015381, "step": 5273 }, { "epoch": 1.17, "learning_rate": 3.89788223956172e-06, "logits/chosen": -1.8681414127349854, "logits/rejected": -1.7830674648284912, "logps/chosen": -159.28457641601562, "logps/rejected": -60.54383087158203, "loss": 0.3723, "rewards/accuracies": 1.0, "rewards/chosen": 7.281628608703613, "rewards/margins": 2.9412317276000977, "rewards/rejected": 4.340396881103516, "step": 5274 }, { "epoch": 1.17, "learning_rate": 3.896134065978861e-06, "logits/chosen": -2.063816785812378, "logits/rejected": -2.090463638305664, "logps/chosen": -83.25732421875, "logps/rejected": -184.31607055664062, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 11.552258491516113, "rewards/margins": 4.135739326477051, "rewards/rejected": 7.4165191650390625, "step": 5275 }, { "epoch": 1.17, "learning_rate": 3.894386034240121e-06, "logits/chosen": -2.219679594039917, "logits/rejected": -2.2356157302856445, "logps/chosen": -81.09071350097656, "logps/rejected": -144.78338623046875, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": 9.898091316223145, "rewards/margins": 1.1320323944091797, "rewards/rejected": 8.766058921813965, "step": 5276 }, { "epoch": 1.17, "learning_rate": 3.892638144570116e-06, "logits/chosen": -2.253209114074707, "logits/rejected": -2.253209114074707, "logps/chosen": -17.500225067138672, "logps/rejected": -17.500225067138672, "loss": 0.366, "rewards/accuracies": 0.0, "rewards/chosen": 4.940457820892334, "rewards/margins": 0.0, "rewards/rejected": 4.940457820892334, "step": 5277 }, { "epoch": 1.17, "learning_rate": 3.890890397193449e-06, "logits/chosen": -1.8764050006866455, "logits/rejected": -1.7981575727462769, "logps/chosen": -100.4522705078125, "logps/rejected": -85.04559326171875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 7.808157444000244, "rewards/margins": 5.751126289367676, "rewards/rejected": 2.0570313930511475, "step": 5278 }, { "epoch": 1.17, "learning_rate": 3.889142792334698e-06, "logits/chosen": -2.09051513671875, "logits/rejected": -2.1398825645446777, "logps/chosen": -43.070159912109375, "logps/rejected": -97.26458740234375, "loss": 0.8264, "rewards/accuracies": 0.0, "rewards/chosen": 5.7225494384765625, "rewards/margins": -1.32550048828125, "rewards/rejected": 7.0480499267578125, "step": 5279 }, { "epoch": 1.17, "learning_rate": 3.887395330218429e-06, "logits/chosen": -1.8712023496627808, "logits/rejected": -1.8365381956100464, "logps/chosen": -92.58274841308594, "logps/rejected": -80.23173522949219, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 10.322761535644531, "rewards/margins": 2.7545623779296875, "rewards/rejected": 7.568199157714844, "step": 5280 }, { "epoch": 1.17, "learning_rate": 3.885648011069184e-06, "logits/chosen": -1.780404806137085, "logits/rejected": -1.780404806137085, "logps/chosen": -11.967074394226074, "logps/rejected": -11.967074394226074, "loss": 2.904, "rewards/accuracies": 0.0, "rewards/chosen": 2.806868076324463, "rewards/margins": 0.0, "rewards/rejected": 2.806868076324463, "step": 5281 }, { "epoch": 1.17, "learning_rate": 3.883900835111493e-06, "logits/chosen": -2.019444227218628, "logits/rejected": -1.9812648296356201, "logps/chosen": -72.32386779785156, "logps/rejected": -40.617103576660156, "loss": 0.2599, "rewards/accuracies": 1.0, "rewards/chosen": 5.316542148590088, "rewards/margins": 2.1333138942718506, "rewards/rejected": 3.1832282543182373, "step": 5282 }, { "epoch": 1.17, "learning_rate": 3.88215380256986e-06, "logits/chosen": -2.4710423946380615, "logits/rejected": -2.4085752964019775, "logps/chosen": -51.85546875, "logps/rejected": -12.080239295959473, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 5.48134708404541, "rewards/margins": 2.428706407546997, "rewards/rejected": 3.052640676498413, "step": 5283 }, { "epoch": 1.17, "learning_rate": 3.8804069136687775e-06, "logits/chosen": -2.1814725399017334, "logits/rejected": -2.2660164833068848, "logps/chosen": -49.09779357910156, "logps/rejected": -129.15676879882812, "loss": 2.8903, "rewards/accuracies": 0.0, "rewards/chosen": 5.475857734680176, "rewards/margins": -5.775418281555176, "rewards/rejected": 11.251276016235352, "step": 5284 }, { "epoch": 1.17, "learning_rate": 3.878660168632713e-06, "logits/chosen": -1.9139742851257324, "logits/rejected": -1.5022324323654175, "logps/chosen": -185.4718017578125, "logps/rejected": -63.320884704589844, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 8.018628120422363, "rewards/margins": 2.687725067138672, "rewards/rejected": 5.330903053283691, "step": 5285 }, { "epoch": 1.17, "learning_rate": 3.876913567686123e-06, "logits/chosen": -1.5626378059387207, "logits/rejected": -1.5675160884857178, "logps/chosen": -43.41726303100586, "logps/rejected": -58.186302185058594, "loss": 0.1706, "rewards/accuracies": 1.0, "rewards/chosen": 3.706489324569702, "rewards/margins": 0.9003567695617676, "rewards/rejected": 2.8061325550079346, "step": 5286 }, { "epoch": 1.17, "learning_rate": 3.87516711105344e-06, "logits/chosen": -1.9471805095672607, "logits/rejected": -1.9030050039291382, "logps/chosen": -31.003355026245117, "logps/rejected": -13.271210670471191, "loss": 0.187, "rewards/accuracies": 1.0, "rewards/chosen": 1.80801260471344, "rewards/margins": 1.1288704872131348, "rewards/rejected": 0.6791421175003052, "step": 5287 }, { "epoch": 1.17, "learning_rate": 3.873420798959078e-06, "logits/chosen": -2.0950284004211426, "logits/rejected": -2.0769145488739014, "logps/chosen": -62.22181701660156, "logps/rejected": -88.880126953125, "loss": 0.2236, "rewards/accuracies": 1.0, "rewards/chosen": 3.5990967750549316, "rewards/margins": 0.9320924282073975, "rewards/rejected": 2.667004346847534, "step": 5288 }, { "epoch": 1.17, "learning_rate": 3.871674631627436e-06, "logits/chosen": -1.553850531578064, "logits/rejected": -1.3772727251052856, "logps/chosen": -46.17159652709961, "logps/rejected": -58.23655700683594, "loss": 0.5845, "rewards/accuracies": 1.0, "rewards/chosen": 8.427212715148926, "rewards/margins": 0.7267746925354004, "rewards/rejected": 7.700438022613525, "step": 5289 }, { "epoch": 1.17, "learning_rate": 3.86992860928289e-06, "logits/chosen": -2.0154366493225098, "logits/rejected": -2.028134346008301, "logps/chosen": -57.88505554199219, "logps/rejected": -89.04345703125, "loss": 0.2724, "rewards/accuracies": 1.0, "rewards/chosen": 5.061961650848389, "rewards/margins": 0.4376373291015625, "rewards/rejected": 4.624324321746826, "step": 5290 }, { "epoch": 1.17, "learning_rate": 3.868182732149803e-06, "logits/chosen": -1.8715142011642456, "logits/rejected": -1.8236714601516724, "logps/chosen": -162.710205078125, "logps/rejected": -56.297462463378906, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 7.633096218109131, "rewards/margins": 2.5375523567199707, "rewards/rejected": 5.09554386138916, "step": 5291 }, { "epoch": 1.17, "learning_rate": 3.8664370004525124e-06, "logits/chosen": -1.9633386135101318, "logits/rejected": -1.922552227973938, "logps/chosen": -88.75938415527344, "logps/rejected": -70.25635528564453, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 4.739006042480469, "rewards/margins": 2.5751655101776123, "rewards/rejected": 2.1638405323028564, "step": 5292 }, { "epoch": 1.17, "learning_rate": 3.8646914144153435e-06, "logits/chosen": -1.957186222076416, "logits/rejected": -1.9948241710662842, "logps/chosen": -44.2872428894043, "logps/rejected": -93.81071472167969, "loss": 3.0712, "rewards/accuracies": 0.0, "rewards/chosen": 4.411811351776123, "rewards/margins": -3.796201229095459, "rewards/rejected": 8.208012580871582, "step": 5293 }, { "epoch": 1.17, "learning_rate": 3.862945974262597e-06, "logits/chosen": -1.8248603343963623, "logits/rejected": -1.6808425188064575, "logps/chosen": -129.90066528320312, "logps/rejected": -50.2547721862793, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": 5.377197265625, "rewards/margins": 2.863600492477417, "rewards/rejected": 2.513596773147583, "step": 5294 }, { "epoch": 1.17, "learning_rate": 3.861200680218561e-06, "logits/chosen": -1.8392950296401978, "logits/rejected": -1.7883858680725098, "logps/chosen": -22.564220428466797, "logps/rejected": -7.7115349769592285, "loss": 0.187, "rewards/accuracies": 1.0, "rewards/chosen": 2.745335340499878, "rewards/margins": 1.3352293968200684, "rewards/rejected": 1.4101059436798096, "step": 5295 }, { "epoch": 1.17, "learning_rate": 3.8594555325075e-06, "logits/chosen": -2.3429205417633057, "logits/rejected": -2.3974859714508057, "logps/chosen": -106.4708251953125, "logps/rejected": -149.12486267089844, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": 10.219430923461914, "rewards/margins": 2.146224021911621, "rewards/rejected": 8.073206901550293, "step": 5296 }, { "epoch": 1.17, "learning_rate": 3.857710531353661e-06, "logits/chosen": -2.0424797534942627, "logits/rejected": -2.039497137069702, "logps/chosen": -40.61000061035156, "logps/rejected": -51.81298828125, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 3.8437256813049316, "rewards/margins": 1.0260543823242188, "rewards/rejected": 2.817671298980713, "step": 5297 }, { "epoch": 1.17, "learning_rate": 3.8559656769812746e-06, "logits/chosen": -2.069809913635254, "logits/rejected": -2.071953058242798, "logps/chosen": -78.45600891113281, "logps/rejected": -79.84678649902344, "loss": 1.1089, "rewards/accuracies": 0.0, "rewards/chosen": 3.772998094558716, "rewards/margins": -1.9217956066131592, "rewards/rejected": 5.694793701171875, "step": 5298 }, { "epoch": 1.17, "learning_rate": 3.854220969614545e-06, "logits/chosen": -1.833979845046997, "logits/rejected": -1.793378472328186, "logps/chosen": -76.952880859375, "logps/rejected": -144.76625061035156, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 10.623549461364746, "rewards/margins": 3.017740249633789, "rewards/rejected": 7.605809211730957, "step": 5299 }, { "epoch": 1.17, "learning_rate": 3.852476409477668e-06, "logits/chosen": -1.7893999814987183, "logits/rejected": -1.846709966659546, "logps/chosen": -35.55049514770508, "logps/rejected": -68.62798309326172, "loss": 2.1391, "rewards/accuracies": 0.0, "rewards/chosen": 3.9131131172180176, "rewards/margins": -4.259531497955322, "rewards/rejected": 8.17264461517334, "step": 5300 }, { "epoch": 1.17, "learning_rate": 3.850731996794813e-06, "logits/chosen": -1.9412559270858765, "logits/rejected": -1.8990106582641602, "logps/chosen": -82.42506408691406, "logps/rejected": -113.56629180908203, "loss": 0.1778, "rewards/accuracies": 1.0, "rewards/chosen": 8.881131172180176, "rewards/margins": 2.678055763244629, "rewards/rejected": 6.203075408935547, "step": 5301 }, { "epoch": 1.17, "learning_rate": 3.848987731790136e-06, "logits/chosen": -1.7756212949752808, "logits/rejected": -1.7825157642364502, "logps/chosen": -36.486289978027344, "logps/rejected": -42.144996643066406, "loss": 1.0068, "rewards/accuracies": 0.0, "rewards/chosen": 2.7825798988342285, "rewards/margins": -1.6231231689453125, "rewards/rejected": 4.405703067779541, "step": 5302 }, { "epoch": 1.17, "learning_rate": 3.847243614687765e-06, "logits/chosen": -1.8335165977478027, "logits/rejected": -1.906774640083313, "logps/chosen": -45.01192092895508, "logps/rejected": -107.23953247070312, "loss": 2.1835, "rewards/accuracies": 0.0, "rewards/chosen": 4.255805492401123, "rewards/margins": -4.125368595123291, "rewards/rejected": 8.381174087524414, "step": 5303 }, { "epoch": 1.17, "learning_rate": 3.845499645711821e-06, "logits/chosen": -2.103172540664673, "logits/rejected": -2.0835964679718018, "logps/chosen": -34.650718688964844, "logps/rejected": -62.030338287353516, "loss": 0.2334, "rewards/accuracies": 1.0, "rewards/chosen": 4.207035064697266, "rewards/margins": 0.5264582633972168, "rewards/rejected": 3.680576801300049, "step": 5304 }, { "epoch": 1.17, "learning_rate": 3.843755825086396e-06, "logits/chosen": -1.8620213270187378, "logits/rejected": -1.9033632278442383, "logps/chosen": -47.0706787109375, "logps/rejected": -87.7112808227539, "loss": 0.449, "rewards/accuracies": 0.0, "rewards/chosen": 7.207406520843506, "rewards/margins": -0.351778507232666, "rewards/rejected": 7.559185028076172, "step": 5305 }, { "epoch": 1.17, "learning_rate": 3.842012153035569e-06, "logits/chosen": -1.9410678148269653, "logits/rejected": -1.8968929052352905, "logps/chosen": -58.57927322387695, "logps/rejected": -28.199108123779297, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": 2.561143159866333, "rewards/margins": 1.9911446571350098, "rewards/rejected": 0.569998562335968, "step": 5306 }, { "epoch": 1.17, "learning_rate": 3.840268629783398e-06, "logits/chosen": -2.045153856277466, "logits/rejected": -1.974943995475769, "logps/chosen": -110.28074645996094, "logps/rejected": -23.423635482788086, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 7.118557929992676, "rewards/margins": 3.953106641769409, "rewards/rejected": 3.1654512882232666, "step": 5307 }, { "epoch": 1.17, "learning_rate": 3.838525255553916e-06, "logits/chosen": -2.339630603790283, "logits/rejected": -2.305602788925171, "logps/chosen": -79.85614013671875, "logps/rejected": -37.08580017089844, "loss": 0.1852, "rewards/accuracies": 1.0, "rewards/chosen": 5.2454729080200195, "rewards/margins": 0.8321528434753418, "rewards/rejected": 4.413320064544678, "step": 5308 }, { "epoch": 1.18, "learning_rate": 3.83678203057115e-06, "logits/chosen": -2.0653269290924072, "logits/rejected": -1.9927639961242676, "logps/chosen": -63.027503967285156, "logps/rejected": -15.224554061889648, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": 3.3530967235565186, "rewards/margins": 2.0994114875793457, "rewards/rejected": 1.2536852359771729, "step": 5309 }, { "epoch": 1.18, "learning_rate": 3.835038955059094e-06, "logits/chosen": -1.7490352392196655, "logits/rejected": -1.6492574214935303, "logps/chosen": -43.42725372314453, "logps/rejected": -19.256784439086914, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 3.9507546424865723, "rewards/margins": 3.2387642860412598, "rewards/rejected": 0.7119903564453125, "step": 5310 }, { "epoch": 1.18, "learning_rate": 3.833296029241734e-06, "logits/chosen": -2.1963589191436768, "logits/rejected": -2.146867036819458, "logps/chosen": -84.63640594482422, "logps/rejected": -72.38117980957031, "loss": 0.3591, "rewards/accuracies": 1.0, "rewards/chosen": 5.454483985900879, "rewards/margins": 0.3185462951660156, "rewards/rejected": 5.135937690734863, "step": 5311 }, { "epoch": 1.18, "learning_rate": 3.8315532533430285e-06, "logits/chosen": -2.0047833919525146, "logits/rejected": -1.9221479892730713, "logps/chosen": -43.54187774658203, "logps/rejected": -19.420944213867188, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 5.860180854797363, "rewards/margins": 4.574443817138672, "rewards/rejected": 1.2857372760772705, "step": 5312 }, { "epoch": 1.18, "learning_rate": 3.829810627586921e-06, "logits/chosen": -2.0941567420959473, "logits/rejected": -2.1219289302825928, "logps/chosen": -111.7538070678711, "logps/rejected": -190.81719970703125, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": 11.900660514831543, "rewards/margins": 2.4837379455566406, "rewards/rejected": 9.416922569274902, "step": 5313 }, { "epoch": 1.18, "learning_rate": 3.828068152197335e-06, "logits/chosen": -1.9352178573608398, "logits/rejected": -1.7690716981887817, "logps/chosen": -60.34941101074219, "logps/rejected": -13.624621391296387, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 5.556874752044678, "rewards/margins": 3.637861728668213, "rewards/rejected": 1.9190129041671753, "step": 5314 }, { "epoch": 1.18, "learning_rate": 3.826325827398176e-06, "logits/chosen": -2.1802315711975098, "logits/rejected": -2.1607110500335693, "logps/chosen": -62.871482849121094, "logps/rejected": -69.65093231201172, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": 4.013520240783691, "rewards/margins": 1.436213731765747, "rewards/rejected": 2.5773065090179443, "step": 5315 }, { "epoch": 1.18, "learning_rate": 3.824583653413324e-06, "logits/chosen": -2.1047160625457764, "logits/rejected": -2.101325750350952, "logps/chosen": -66.53036499023438, "logps/rejected": -73.00049591064453, "loss": 0.1874, "rewards/accuracies": 1.0, "rewards/chosen": 6.621969699859619, "rewards/margins": 2.0956640243530273, "rewards/rejected": 4.526305675506592, "step": 5316 }, { "epoch": 1.18, "learning_rate": 3.822841630466648e-06, "logits/chosen": -2.1655337810516357, "logits/rejected": -2.1655337810516357, "logps/chosen": -22.968326568603516, "logps/rejected": -22.968326568603516, "loss": 0.3631, "rewards/accuracies": 0.0, "rewards/chosen": 4.121560573577881, "rewards/margins": 0.0, "rewards/rejected": 4.121560573577881, "step": 5317 }, { "epoch": 1.18, "learning_rate": 3.821099758781994e-06, "logits/chosen": -1.746026873588562, "logits/rejected": -1.7179793119430542, "logps/chosen": -31.897171020507812, "logps/rejected": -60.054588317871094, "loss": 0.5314, "rewards/accuracies": 0.0, "rewards/chosen": 3.404902696609497, "rewards/margins": -0.223846435546875, "rewards/rejected": 3.628749132156372, "step": 5318 }, { "epoch": 1.18, "learning_rate": 3.819358038583184e-06, "logits/chosen": -2.077039957046509, "logits/rejected": -2.0450892448425293, "logps/chosen": -62.531124114990234, "logps/rejected": -70.7150650024414, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 4.696831226348877, "rewards/margins": 1.8547627925872803, "rewards/rejected": 2.8420684337615967, "step": 5319 }, { "epoch": 1.18, "learning_rate": 3.817616470094031e-06, "logits/chosen": -1.765828013420105, "logits/rejected": -1.669442892074585, "logps/chosen": -56.779563903808594, "logps/rejected": -29.608562469482422, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 2.835050344467163, "rewards/margins": 2.0537185668945312, "rewards/rejected": 0.7813316583633423, "step": 5320 }, { "epoch": 1.18, "learning_rate": 3.815875053538317e-06, "logits/chosen": -2.1948273181915283, "logits/rejected": -2.179759979248047, "logps/chosen": -52.86384582519531, "logps/rejected": -68.0577621459961, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": 3.4899697303771973, "rewards/margins": 1.6779145002365112, "rewards/rejected": 1.812055230140686, "step": 5321 }, { "epoch": 1.18, "learning_rate": 3.8141337891398135e-06, "logits/chosen": -2.21909236907959, "logits/rejected": -2.2390341758728027, "logps/chosen": -35.63162612915039, "logps/rejected": -79.08744812011719, "loss": 1.6891, "rewards/accuracies": 0.0, "rewards/chosen": 4.2126264572143555, "rewards/margins": -2.9676456451416016, "rewards/rejected": 7.180272102355957, "step": 5322 }, { "epoch": 1.18, "learning_rate": 3.8123926771222664e-06, "logits/chosen": -1.7883027791976929, "logits/rejected": -1.7370332479476929, "logps/chosen": -20.007278442382812, "logps/rejected": -75.56669616699219, "loss": 0.4873, "rewards/accuracies": 1.0, "rewards/chosen": 1.9995777606964111, "rewards/margins": 0.36572301387786865, "rewards/rejected": 1.6338547468185425, "step": 5323 }, { "epoch": 1.18, "learning_rate": 3.810651717709406e-06, "logits/chosen": -1.9294310808181763, "logits/rejected": -1.3668280839920044, "logps/chosen": -86.65958404541016, "logps/rejected": -86.34693908691406, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 7.024364471435547, "rewards/margins": 4.660164833068848, "rewards/rejected": 2.3641998767852783, "step": 5324 }, { "epoch": 1.18, "learning_rate": 3.8089109111249394e-06, "logits/chosen": -1.6289602518081665, "logits/rejected": -1.576310634613037, "logps/chosen": -26.21290397644043, "logps/rejected": -20.062976837158203, "loss": 0.5046, "rewards/accuracies": 0.0, "rewards/chosen": 2.1680147647857666, "rewards/margins": -0.5533952713012695, "rewards/rejected": 2.721410036087036, "step": 5325 }, { "epoch": 1.18, "learning_rate": 3.8071702575925594e-06, "logits/chosen": -2.0212814807891846, "logits/rejected": -1.8681472539901733, "logps/chosen": -174.07423400878906, "logps/rejected": -67.35232543945312, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 6.151210308074951, "rewards/margins": 3.978557825088501, "rewards/rejected": 2.17265248298645, "step": 5326 }, { "epoch": 1.18, "learning_rate": 3.805429757335933e-06, "logits/chosen": -1.8487519025802612, "logits/rejected": -1.7239493131637573, "logps/chosen": -54.324100494384766, "logps/rejected": -55.369651794433594, "loss": 0.3322, "rewards/accuracies": 1.0, "rewards/chosen": 4.86103630065918, "rewards/margins": 0.20845842361450195, "rewards/rejected": 4.652577877044678, "step": 5327 }, { "epoch": 1.18, "learning_rate": 3.8036894105787103e-06, "logits/chosen": -2.2811498641967773, "logits/rejected": -2.292308807373047, "logps/chosen": -117.01966094970703, "logps/rejected": -47.9732666015625, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": 6.810567378997803, "rewards/margins": 3.016808271408081, "rewards/rejected": 3.7937591075897217, "step": 5328 }, { "epoch": 1.18, "learning_rate": 3.8019492175445237e-06, "logits/chosen": -1.7890068292617798, "logits/rejected": -1.7961878776550293, "logps/chosen": -54.12633514404297, "logps/rejected": -56.60542297363281, "loss": 0.4165, "rewards/accuracies": 1.0, "rewards/chosen": 3.5199906826019287, "rewards/margins": 0.13185596466064453, "rewards/rejected": 3.388134717941284, "step": 5329 }, { "epoch": 1.18, "learning_rate": 3.8002091784569824e-06, "logits/chosen": -2.030458688735962, "logits/rejected": -2.066950798034668, "logps/chosen": -51.028472900390625, "logps/rejected": -85.17198944091797, "loss": 2.9128, "rewards/accuracies": 0.0, "rewards/chosen": 3.5982728004455566, "rewards/margins": -5.372578144073486, "rewards/rejected": 8.970850944519043, "step": 5330 }, { "epoch": 1.18, "learning_rate": 3.7984692935396778e-06, "logits/chosen": -1.7202095985412598, "logits/rejected": -1.7202095985412598, "logps/chosen": -60.743717193603516, "logps/rejected": -60.743717193603516, "loss": 1.1711, "rewards/accuracies": 0.0, "rewards/chosen": 4.371675491333008, "rewards/margins": 0.0, "rewards/rejected": 4.371675491333008, "step": 5331 }, { "epoch": 1.18, "learning_rate": 3.796729563016179e-06, "logits/chosen": -2.054816961288452, "logits/rejected": -2.0679500102996826, "logps/chosen": -49.36587142944336, "logps/rejected": -55.253944396972656, "loss": 0.8562, "rewards/accuracies": 0.0, "rewards/chosen": 4.062524795532227, "rewards/margins": -0.5545139312744141, "rewards/rejected": 4.617038726806641, "step": 5332 }, { "epoch": 1.18, "learning_rate": 3.7949899871100405e-06, "logits/chosen": -1.6067395210266113, "logits/rejected": -1.6067395210266113, "logps/chosen": -6.1829657554626465, "logps/rejected": -6.1829657554626465, "loss": 0.3493, "rewards/accuracies": 0.0, "rewards/chosen": 1.8291399478912354, "rewards/margins": 0.0, "rewards/rejected": 1.8291399478912354, "step": 5333 }, { "epoch": 1.18, "learning_rate": 3.793250566044789e-06, "logits/chosen": -1.8672170639038086, "logits/rejected": -1.6065740585327148, "logps/chosen": -38.5000114440918, "logps/rejected": -173.57095336914062, "loss": 1.6674, "rewards/accuracies": 0.0, "rewards/chosen": 4.0704264640808105, "rewards/margins": -3.1383914947509766, "rewards/rejected": 7.208817958831787, "step": 5334 }, { "epoch": 1.18, "learning_rate": 3.7915113000439423e-06, "logits/chosen": -1.8719534873962402, "logits/rejected": -1.7527186870574951, "logps/chosen": -71.55313873291016, "logps/rejected": -76.51841735839844, "loss": 0.4069, "rewards/accuracies": 1.0, "rewards/chosen": 3.31559157371521, "rewards/margins": 1.319435954093933, "rewards/rejected": 1.9961556196212769, "step": 5335 }, { "epoch": 1.18, "learning_rate": 3.7897721893309857e-06, "logits/chosen": -1.9777488708496094, "logits/rejected": -1.9106606245040894, "logps/chosen": -69.64309692382812, "logps/rejected": -120.38742065429688, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 8.654840469360352, "rewards/margins": 1.9509754180908203, "rewards/rejected": 6.703865051269531, "step": 5336 }, { "epoch": 1.18, "learning_rate": 3.7880332341293958e-06, "logits/chosen": -1.7999709844589233, "logits/rejected": -1.7528730630874634, "logps/chosen": -43.887001037597656, "logps/rejected": -36.319793701171875, "loss": 0.4532, "rewards/accuracies": 0.0, "rewards/chosen": 3.789780378341675, "rewards/margins": -0.13753986358642578, "rewards/rejected": 3.9273202419281006, "step": 5337 }, { "epoch": 1.18, "learning_rate": 3.786294434662622e-06, "logits/chosen": -1.9925801753997803, "logits/rejected": -2.012730121612549, "logps/chosen": -58.99839782714844, "logps/rejected": -89.34803771972656, "loss": 0.5178, "rewards/accuracies": 1.0, "rewards/chosen": 3.527919054031372, "rewards/margins": 0.5856666564941406, "rewards/rejected": 2.9422523975372314, "step": 5338 }, { "epoch": 1.18, "learning_rate": 3.7845557911540943e-06, "logits/chosen": -1.7937942743301392, "logits/rejected": -1.7937942743301392, "logps/chosen": -92.48536682128906, "logps/rejected": -92.48536682128906, "loss": 0.3684, "rewards/accuracies": 0.0, "rewards/chosen": 4.98867654800415, "rewards/margins": 0.0, "rewards/rejected": 4.98867654800415, "step": 5339 }, { "epoch": 1.18, "learning_rate": 3.7828173038272266e-06, "logits/chosen": -1.6134203672409058, "logits/rejected": -1.6696380376815796, "logps/chosen": -10.166348457336426, "logps/rejected": -29.84305763244629, "loss": 0.242, "rewards/accuracies": 1.0, "rewards/chosen": 1.3276866674423218, "rewards/margins": 0.5175485014915466, "rewards/rejected": 0.8101381659507751, "step": 5340 }, { "epoch": 1.18, "learning_rate": 3.781078972905407e-06, "logits/chosen": -2.1293492317199707, "logits/rejected": -2.092745065689087, "logps/chosen": -110.26477813720703, "logps/rejected": -73.67406463623047, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": 6.28516149520874, "rewards/margins": 3.0435216426849365, "rewards/rejected": 3.2416398525238037, "step": 5341 }, { "epoch": 1.18, "learning_rate": 3.7793407986120113e-06, "logits/chosen": -2.002197742462158, "logits/rejected": -1.9710533618927002, "logps/chosen": -128.9042510986328, "logps/rejected": -69.76104736328125, "loss": 0.6651, "rewards/accuracies": 0.0, "rewards/chosen": 4.307328701019287, "rewards/margins": -0.9632644653320312, "rewards/rejected": 5.270593166351318, "step": 5342 }, { "epoch": 1.18, "learning_rate": 3.777602781170387e-06, "logits/chosen": -1.9844553470611572, "logits/rejected": -1.949666976928711, "logps/chosen": -50.513885498046875, "logps/rejected": -60.75869369506836, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 4.0129899978637695, "rewards/margins": 2.592548131942749, "rewards/rejected": 1.4204418659210205, "step": 5343 }, { "epoch": 1.18, "learning_rate": 3.7758649208038685e-06, "logits/chosen": -1.8493092060089111, "logits/rejected": -1.8083055019378662, "logps/chosen": -34.376834869384766, "logps/rejected": -70.45655059814453, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 3.8377270698547363, "rewards/margins": 3.39141583442688, "rewards/rejected": 0.44631120562553406, "step": 5344 }, { "epoch": 1.18, "learning_rate": 3.774127217735763e-06, "logits/chosen": -2.045964002609253, "logits/rejected": -1.9504666328430176, "logps/chosen": -98.96748352050781, "logps/rejected": -26.679325103759766, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 7.929454326629639, "rewards/margins": 5.031105041503906, "rewards/rejected": 2.8983490467071533, "step": 5345 }, { "epoch": 1.18, "learning_rate": 3.772389672189365e-06, "logits/chosen": -2.0444560050964355, "logits/rejected": -2.05133318901062, "logps/chosen": -26.261667251586914, "logps/rejected": -38.854007720947266, "loss": 0.5332, "rewards/accuracies": 0.0, "rewards/chosen": 2.9909117221832275, "rewards/margins": -0.6119322776794434, "rewards/rejected": 3.602843999862671, "step": 5346 }, { "epoch": 1.18, "learning_rate": 3.7706522843879435e-06, "logits/chosen": -1.93535315990448, "logits/rejected": -1.4676536321640015, "logps/chosen": -147.45655822753906, "logps/rejected": -84.27626037597656, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": 7.883174419403076, "rewards/margins": 2.658273696899414, "rewards/rejected": 5.224900722503662, "step": 5347 }, { "epoch": 1.18, "learning_rate": 3.7689150545547467e-06, "logits/chosen": -1.9700859785079956, "logits/rejected": -1.9376425743103027, "logps/chosen": -52.987701416015625, "logps/rejected": -35.301666259765625, "loss": 0.1906, "rewards/accuracies": 1.0, "rewards/chosen": 3.523542881011963, "rewards/margins": 0.97031569480896, "rewards/rejected": 2.553227186203003, "step": 5348 }, { "epoch": 1.18, "learning_rate": 3.7671779829130067e-06, "logits/chosen": -1.9388636350631714, "logits/rejected": -1.9850541353225708, "logps/chosen": -76.68304443359375, "logps/rejected": -159.70700073242188, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": 9.436914443969727, "rewards/margins": 1.4095335006713867, "rewards/rejected": 8.02738094329834, "step": 5349 }, { "epoch": 1.18, "learning_rate": 3.7654410696859304e-06, "logits/chosen": -1.9842708110809326, "logits/rejected": -1.9611583948135376, "logps/chosen": -29.982532501220703, "logps/rejected": -24.72576904296875, "loss": 0.4807, "rewards/accuracies": 0.0, "rewards/chosen": 2.8844707012176514, "rewards/margins": -0.3913688659667969, "rewards/rejected": 3.2758395671844482, "step": 5350 }, { "epoch": 1.18, "learning_rate": 3.763704315096711e-06, "logits/chosen": -1.8089325428009033, "logits/rejected": -1.8245974779129028, "logps/chosen": -46.994728088378906, "logps/rejected": -79.68402099609375, "loss": 0.2679, "rewards/accuracies": 1.0, "rewards/chosen": 3.663715362548828, "rewards/margins": 1.1899816989898682, "rewards/rejected": 2.47373366355896, "step": 5351 }, { "epoch": 1.18, "learning_rate": 3.761967719368513e-06, "logits/chosen": -1.54056978225708, "logits/rejected": -1.3658040761947632, "logps/chosen": -92.10546112060547, "logps/rejected": -56.10426330566406, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": 5.6364264488220215, "rewards/margins": 3.786989450454712, "rewards/rejected": 1.8494369983673096, "step": 5352 }, { "epoch": 1.18, "learning_rate": 3.7602312827244895e-06, "logits/chosen": -1.9742615222930908, "logits/rejected": -1.568574070930481, "logps/chosen": -62.632843017578125, "logps/rejected": -77.64642333984375, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 7.751821994781494, "rewards/margins": 1.3854265213012695, "rewards/rejected": 6.366395473480225, "step": 5353 }, { "epoch": 1.19, "learning_rate": 3.7584950053877646e-06, "logits/chosen": -2.133864402770996, "logits/rejected": -2.1596179008483887, "logps/chosen": -64.17668151855469, "logps/rejected": -84.52825164794922, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 5.596477031707764, "rewards/margins": 2.559875011444092, "rewards/rejected": 3.036602020263672, "step": 5354 }, { "epoch": 1.19, "learning_rate": 3.756758887581447e-06, "logits/chosen": -1.967156171798706, "logits/rejected": -1.895862340927124, "logps/chosen": -49.89238739013672, "logps/rejected": -40.908538818359375, "loss": 0.3137, "rewards/accuracies": 1.0, "rewards/chosen": 3.220618486404419, "rewards/margins": 0.561225175857544, "rewards/rejected": 2.659393310546875, "step": 5355 }, { "epoch": 1.19, "learning_rate": 3.755022929528623e-06, "logits/chosen": -2.122457981109619, "logits/rejected": -2.0288798809051514, "logps/chosen": -132.62489318847656, "logps/rejected": -39.026920318603516, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 6.345826625823975, "rewards/margins": 6.272608280181885, "rewards/rejected": 0.07321815937757492, "step": 5356 }, { "epoch": 1.19, "learning_rate": 3.7532871314523615e-06, "logits/chosen": -2.4883882999420166, "logits/rejected": -2.474346876144409, "logps/chosen": -46.686546325683594, "logps/rejected": -17.02596092224121, "loss": 0.1915, "rewards/accuracies": 1.0, "rewards/chosen": 2.1293227672576904, "rewards/margins": 1.1925559043884277, "rewards/rejected": 0.9367668032646179, "step": 5357 }, { "epoch": 1.19, "learning_rate": 3.751551493575707e-06, "logits/chosen": -1.9497939348220825, "logits/rejected": -2.008742332458496, "logps/chosen": -55.393714904785156, "logps/rejected": -81.0118408203125, "loss": 1.4546, "rewards/accuracies": 0.0, "rewards/chosen": 4.12817907333374, "rewards/margins": -2.1538829803466797, "rewards/rejected": 6.28206205368042, "step": 5358 }, { "epoch": 1.19, "learning_rate": 3.749816016121681e-06, "logits/chosen": -1.754258394241333, "logits/rejected": -1.7136638164520264, "logps/chosen": -46.565738677978516, "logps/rejected": -49.41135025024414, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": 4.29718542098999, "rewards/margins": 1.9387660026550293, "rewards/rejected": 2.358419418334961, "step": 5359 }, { "epoch": 1.19, "learning_rate": 3.7480806993132948e-06, "logits/chosen": -2.125652551651001, "logits/rejected": -2.118159294128418, "logps/chosen": -134.04812622070312, "logps/rejected": -46.883033752441406, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": 5.648959636688232, "rewards/margins": 3.339411497116089, "rewards/rejected": 2.3095481395721436, "step": 5360 }, { "epoch": 1.19, "learning_rate": 3.7463455433735265e-06, "logits/chosen": -1.629912257194519, "logits/rejected": -1.63154137134552, "logps/chosen": -33.524871826171875, "logps/rejected": -41.23640060424805, "loss": 0.3377, "rewards/accuracies": 1.0, "rewards/chosen": 4.8295488357543945, "rewards/margins": 1.1423063278198242, "rewards/rejected": 3.6872425079345703, "step": 5361 }, { "epoch": 1.19, "learning_rate": 3.744610548525345e-06, "logits/chosen": -1.8529713153839111, "logits/rejected": -1.8570433855056763, "logps/chosen": -54.006126403808594, "logps/rejected": -95.53388214111328, "loss": 0.2405, "rewards/accuracies": 1.0, "rewards/chosen": 3.6813766956329346, "rewards/margins": 0.5224151611328125, "rewards/rejected": 3.158961534500122, "step": 5362 }, { "epoch": 1.19, "learning_rate": 3.7428757149916884e-06, "logits/chosen": -1.698696494102478, "logits/rejected": -1.78326416015625, "logps/chosen": -55.03792953491211, "logps/rejected": -91.21931457519531, "loss": 1.1376, "rewards/accuracies": 0.0, "rewards/chosen": 4.629781246185303, "rewards/margins": -2.1655054092407227, "rewards/rejected": 6.795286655426025, "step": 5363 }, { "epoch": 1.19, "learning_rate": 3.7411410429954815e-06, "logits/chosen": -1.9629155397415161, "logits/rejected": -1.9456931352615356, "logps/chosen": -40.75841522216797, "logps/rejected": -59.23651123046875, "loss": 0.3913, "rewards/accuracies": 0.0, "rewards/chosen": 2.4074501991271973, "rewards/margins": -0.15910792350769043, "rewards/rejected": 2.5665581226348877, "step": 5364 }, { "epoch": 1.19, "learning_rate": 3.739406532759622e-06, "logits/chosen": -1.6278690099716187, "logits/rejected": -1.6328355073928833, "logps/chosen": -28.19937515258789, "logps/rejected": -67.80189514160156, "loss": 1.2202, "rewards/accuracies": 0.0, "rewards/chosen": 2.137784957885742, "rewards/margins": -0.8079662322998047, "rewards/rejected": 2.945751190185547, "step": 5365 }, { "epoch": 1.19, "learning_rate": 3.737672184506995e-06, "logits/chosen": -2.072716474533081, "logits/rejected": -2.0473062992095947, "logps/chosen": -41.83805847167969, "logps/rejected": -74.73789978027344, "loss": 0.4707, "rewards/accuracies": 0.0, "rewards/chosen": 4.687928676605225, "rewards/margins": -0.31202173233032227, "rewards/rejected": 4.999950408935547, "step": 5366 }, { "epoch": 1.19, "learning_rate": 3.735937998460456e-06, "logits/chosen": -1.8111560344696045, "logits/rejected": -1.7895389795303345, "logps/chosen": -59.50384521484375, "logps/rejected": -68.44084930419922, "loss": 2.4514, "rewards/accuracies": 0.0, "rewards/chosen": 3.8028297424316406, "rewards/margins": -0.8548521995544434, "rewards/rejected": 4.657681941986084, "step": 5367 }, { "epoch": 1.19, "learning_rate": 3.7342039748428473e-06, "logits/chosen": -1.8862640857696533, "logits/rejected": -1.8163186311721802, "logps/chosen": -106.14839172363281, "logps/rejected": -77.5415267944336, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 6.594836711883545, "rewards/margins": 3.9401299953460693, "rewards/rejected": 2.6547067165374756, "step": 5368 }, { "epoch": 1.19, "learning_rate": 3.7324701138769846e-06, "logits/chosen": -1.9027061462402344, "logits/rejected": -1.8267849683761597, "logps/chosen": -148.18667602539062, "logps/rejected": -145.79208374023438, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 9.235574722290039, "rewards/margins": 2.361482620239258, "rewards/rejected": 6.874092102050781, "step": 5369 }, { "epoch": 1.19, "learning_rate": 3.730736415785664e-06, "logits/chosen": -1.8008633852005005, "logits/rejected": -1.8044008016586304, "logps/chosen": -169.93661499023438, "logps/rejected": -69.4126968383789, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 7.085350036621094, "rewards/margins": 2.1463980674743652, "rewards/rejected": 4.9389519691467285, "step": 5370 }, { "epoch": 1.19, "learning_rate": 3.7290028807916637e-06, "logits/chosen": -1.9873874187469482, "logits/rejected": -1.911681890487671, "logps/chosen": -31.705936431884766, "logps/rejected": -43.43671417236328, "loss": 0.3846, "rewards/accuracies": 0.0, "rewards/chosen": 3.1685009002685547, "rewards/margins": -0.09845399856567383, "rewards/rejected": 3.2669548988342285, "step": 5371 }, { "epoch": 1.19, "learning_rate": 3.727269509117738e-06, "logits/chosen": -2.137299060821533, "logits/rejected": -2.015522003173828, "logps/chosen": -74.80252075195312, "logps/rejected": -32.78606414794922, "loss": 0.4862, "rewards/accuracies": 1.0, "rewards/chosen": 6.351834297180176, "rewards/margins": 3.816586494445801, "rewards/rejected": 2.535247802734375, "step": 5372 }, { "epoch": 1.19, "learning_rate": 3.7255363009866207e-06, "logits/chosen": -1.903892993927002, "logits/rejected": -1.8467966318130493, "logps/chosen": -122.89631652832031, "logps/rejected": -45.14177703857422, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 7.472880840301514, "rewards/margins": 2.910754680633545, "rewards/rejected": 4.562126159667969, "step": 5373 }, { "epoch": 1.19, "learning_rate": 3.7238032566210236e-06, "logits/chosen": -1.958410382270813, "logits/rejected": -1.971017837524414, "logps/chosen": -42.437522888183594, "logps/rejected": -97.20564270019531, "loss": 1.1228, "rewards/accuracies": 0.0, "rewards/chosen": 4.141173839569092, "rewards/margins": -1.4691762924194336, "rewards/rejected": 5.610350131988525, "step": 5374 }, { "epoch": 1.19, "learning_rate": 3.7220703762436423e-06, "logits/chosen": -1.8884382247924805, "logits/rejected": -1.8731977939605713, "logps/chosen": -41.828834533691406, "logps/rejected": -77.351806640625, "loss": 0.1911, "rewards/accuracies": 1.0, "rewards/chosen": 4.619872570037842, "rewards/margins": 1.2521722316741943, "rewards/rejected": 3.3677003383636475, "step": 5375 }, { "epoch": 1.19, "learning_rate": 3.7203376600771435e-06, "logits/chosen": -1.9746794700622559, "logits/rejected": -1.8967097997665405, "logps/chosen": -79.98811340332031, "logps/rejected": -123.34130096435547, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": 8.201863288879395, "rewards/margins": 1.765017032623291, "rewards/rejected": 6.4368462562561035, "step": 5376 }, { "epoch": 1.19, "learning_rate": 3.718605108344181e-06, "logits/chosen": -1.6765600442886353, "logits/rejected": -1.8214682340621948, "logps/chosen": -27.249366760253906, "logps/rejected": -120.06283569335938, "loss": 1.7263, "rewards/accuracies": 0.0, "rewards/chosen": 3.309617280960083, "rewards/margins": -3.3442189693450928, "rewards/rejected": 6.653836250305176, "step": 5377 }, { "epoch": 1.19, "learning_rate": 3.7168727212673816e-06, "logits/chosen": -1.8561514616012573, "logits/rejected": -1.844213604927063, "logps/chosen": -44.451438903808594, "logps/rejected": -57.621299743652344, "loss": 0.4702, "rewards/accuracies": 0.0, "rewards/chosen": 4.177062511444092, "rewards/margins": -0.3831920623779297, "rewards/rejected": 4.5602545738220215, "step": 5378 }, { "epoch": 1.19, "learning_rate": 3.7151404990693507e-06, "logits/chosen": -2.0272107124328613, "logits/rejected": -2.0346086025238037, "logps/chosen": -52.37616729736328, "logps/rejected": -64.37055969238281, "loss": 0.1885, "rewards/accuracies": 1.0, "rewards/chosen": 4.546964168548584, "rewards/margins": 0.8537161350250244, "rewards/rejected": 3.6932480335235596, "step": 5379 }, { "epoch": 1.19, "learning_rate": 3.713408441972679e-06, "logits/chosen": -1.692644476890564, "logits/rejected": -1.6124266386032104, "logps/chosen": -54.99614334106445, "logps/rejected": -49.83697509765625, "loss": 0.1581, "rewards/accuracies": 1.0, "rewards/chosen": 1.7273205518722534, "rewards/margins": 1.312339425086975, "rewards/rejected": 0.41498109698295593, "step": 5380 }, { "epoch": 1.19, "learning_rate": 3.7116765501999297e-06, "logits/chosen": -1.7475996017456055, "logits/rejected": -1.724089503288269, "logps/chosen": -23.01485824584961, "logps/rejected": -70.7142105102539, "loss": 0.4966, "rewards/accuracies": 0.0, "rewards/chosen": 1.8592445850372314, "rewards/margins": -0.33132028579711914, "rewards/rejected": 2.1905648708343506, "step": 5381 }, { "epoch": 1.19, "learning_rate": 3.709944823973647e-06, "logits/chosen": -2.0661354064941406, "logits/rejected": -2.0178630352020264, "logps/chosen": -129.5777130126953, "logps/rejected": -88.05224609375, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 7.523187160491943, "rewards/margins": 3.0875043869018555, "rewards/rejected": 4.435682773590088, "step": 5382 }, { "epoch": 1.19, "learning_rate": 3.7082132635163503e-06, "logits/chosen": -1.9779720306396484, "logits/rejected": -1.8520658016204834, "logps/chosen": -141.88784790039062, "logps/rejected": -76.6958999633789, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 6.514439582824707, "rewards/margins": 2.5830132961273193, "rewards/rejected": 3.9314262866973877, "step": 5383 }, { "epoch": 1.19, "learning_rate": 3.7064818690505465e-06, "logits/chosen": -1.915588140487671, "logits/rejected": -1.8725091218948364, "logps/chosen": -33.949344635009766, "logps/rejected": -24.184106826782227, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 2.722323179244995, "rewards/margins": 1.395666241645813, "rewards/rejected": 1.3266569375991821, "step": 5384 }, { "epoch": 1.19, "learning_rate": 3.7047506407987116e-06, "logits/chosen": -2.1010801792144775, "logits/rejected": -1.9979225397109985, "logps/chosen": -155.96630859375, "logps/rejected": -34.682865142822266, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 7.3405914306640625, "rewards/margins": 6.860734939575195, "rewards/rejected": 0.4798564910888672, "step": 5385 }, { "epoch": 1.19, "learning_rate": 3.7030195789833075e-06, "logits/chosen": -1.7556525468826294, "logits/rejected": -1.548646330833435, "logps/chosen": -43.586273193359375, "logps/rejected": -53.04151153564453, "loss": 0.5412, "rewards/accuracies": 0.0, "rewards/chosen": 5.399881839752197, "rewards/margins": -0.6585144996643066, "rewards/rejected": 6.058396339416504, "step": 5386 }, { "epoch": 1.19, "learning_rate": 3.7012886838267683e-06, "logits/chosen": -2.3172857761383057, "logits/rejected": -2.340846061706543, "logps/chosen": -35.5845947265625, "logps/rejected": -72.63003540039062, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": 4.698143005371094, "rewards/margins": 1.028376817703247, "rewards/rejected": 3.6697661876678467, "step": 5387 }, { "epoch": 1.19, "learning_rate": 3.699557955551511e-06, "logits/chosen": -1.7013843059539795, "logits/rejected": -1.6048860549926758, "logps/chosen": -139.10826110839844, "logps/rejected": -83.77607727050781, "loss": 0.2659, "rewards/accuracies": 1.0, "rewards/chosen": 6.492082118988037, "rewards/margins": 1.7249541282653809, "rewards/rejected": 4.767127990722656, "step": 5388 }, { "epoch": 1.19, "learning_rate": 3.6978273943799316e-06, "logits/chosen": -1.8086358308792114, "logits/rejected": -1.6984467506408691, "logps/chosen": -56.123844146728516, "logps/rejected": -11.318846702575684, "loss": 0.2186, "rewards/accuracies": 1.0, "rewards/chosen": 3.330143451690674, "rewards/margins": 2.1050825119018555, "rewards/rejected": 1.2250608205795288, "step": 5389 }, { "epoch": 1.19, "learning_rate": 3.6960970005344005e-06, "logits/chosen": -2.072972297668457, "logits/rejected": -2.0663094520568848, "logps/chosen": -56.9964485168457, "logps/rejected": -36.984107971191406, "loss": 0.198, "rewards/accuracies": 1.0, "rewards/chosen": 4.226185321807861, "rewards/margins": 0.7817623615264893, "rewards/rejected": 3.444422960281372, "step": 5390 }, { "epoch": 1.19, "learning_rate": 3.6943667742372714e-06, "logits/chosen": -1.8527008295059204, "logits/rejected": -1.8357384204864502, "logps/chosen": -74.14412689208984, "logps/rejected": -44.76103973388672, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 4.746614933013916, "rewards/margins": 3.1853833198547363, "rewards/rejected": 1.5612316131591797, "step": 5391 }, { "epoch": 1.19, "learning_rate": 3.692636715710871e-06, "logits/chosen": -1.9879913330078125, "logits/rejected": -1.9816913604736328, "logps/chosen": -25.911434173583984, "logps/rejected": -26.093584060668945, "loss": 0.3394, "rewards/accuracies": 1.0, "rewards/chosen": 3.7486560344696045, "rewards/margins": 0.03864765167236328, "rewards/rejected": 3.710008382797241, "step": 5392 }, { "epoch": 1.19, "learning_rate": 3.6909068251775125e-06, "logits/chosen": -1.5517382621765137, "logits/rejected": -1.0308862924575806, "logps/chosen": -59.37013244628906, "logps/rejected": -117.28976440429688, "loss": 0.5689, "rewards/accuracies": 0.0, "rewards/chosen": 3.9991776943206787, "rewards/margins": -0.6914198398590088, "rewards/rejected": 4.6905975341796875, "step": 5393 }, { "epoch": 1.19, "learning_rate": 3.689177102859477e-06, "logits/chosen": -1.7851845026016235, "logits/rejected": -1.7683815956115723, "logps/chosen": -36.75908660888672, "logps/rejected": -56.53791809082031, "loss": 0.6467, "rewards/accuracies": 1.0, "rewards/chosen": 3.886955976486206, "rewards/margins": 1.0416526794433594, "rewards/rejected": 2.8453032970428467, "step": 5394 }, { "epoch": 1.19, "learning_rate": 3.687447548979035e-06, "logits/chosen": -1.8981314897537231, "logits/rejected": -2.0024352073669434, "logps/chosen": -93.57544708251953, "logps/rejected": -94.73474884033203, "loss": 1.5824, "rewards/accuracies": 0.0, "rewards/chosen": 6.4844489097595215, "rewards/margins": -2.9382987022399902, "rewards/rejected": 9.422747611999512, "step": 5395 }, { "epoch": 1.19, "learning_rate": 3.685718163758427e-06, "logits/chosen": -1.8060574531555176, "logits/rejected": -1.7608487606048584, "logps/chosen": -50.83452606201172, "logps/rejected": -50.82036590576172, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 4.202561378479004, "rewards/margins": 2.1026527881622314, "rewards/rejected": 2.0999085903167725, "step": 5396 }, { "epoch": 1.19, "learning_rate": 3.6839889474198755e-06, "logits/chosen": -1.9894518852233887, "logits/rejected": -2.0037434101104736, "logps/chosen": -74.17538452148438, "logps/rejected": -120.78443908691406, "loss": 0.1959, "rewards/accuracies": 1.0, "rewards/chosen": 4.765652656555176, "rewards/margins": 1.0523271560668945, "rewards/rejected": 3.7133255004882812, "step": 5397 }, { "epoch": 1.19, "learning_rate": 3.682259900185582e-06, "logits/chosen": -1.8777272701263428, "logits/rejected": -1.840448260307312, "logps/chosen": -37.250465393066406, "logps/rejected": -26.00501251220703, "loss": 0.4765, "rewards/accuracies": 1.0, "rewards/chosen": 2.3948094844818115, "rewards/margins": 0.3128361701965332, "rewards/rejected": 2.0819733142852783, "step": 5398 }, { "epoch": 1.19, "learning_rate": 3.680531022277721e-06, "logits/chosen": -1.8013051748275757, "logits/rejected": -1.7962497472763062, "logps/chosen": -53.10116958618164, "logps/rejected": -43.57274627685547, "loss": 0.8215, "rewards/accuracies": 0.0, "rewards/chosen": 4.440945148468018, "rewards/margins": -0.7228589057922363, "rewards/rejected": 5.163804054260254, "step": 5399 }, { "epoch": 1.2, "learning_rate": 3.678802313918455e-06, "logits/chosen": -1.7960456609725952, "logits/rejected": -1.7612075805664062, "logps/chosen": -30.570152282714844, "logps/rejected": -48.192726135253906, "loss": 0.192, "rewards/accuracies": 1.0, "rewards/chosen": 4.027498722076416, "rewards/margins": 0.7630424499511719, "rewards/rejected": 3.264456272125244, "step": 5400 }, { "epoch": 1.2, "learning_rate": 3.6770737753299115e-06, "logits/chosen": -1.7296115159988403, "logits/rejected": -1.683258295059204, "logps/chosen": -31.13631248474121, "logps/rejected": -33.525970458984375, "loss": 0.1792, "rewards/accuracies": 1.0, "rewards/chosen": 3.6579155921936035, "rewards/margins": 1.5596771240234375, "rewards/rejected": 2.098238468170166, "step": 5401 }, { "epoch": 1.2, "learning_rate": 3.675345406734211e-06, "logits/chosen": -2.226221799850464, "logits/rejected": -2.164491891860962, "logps/chosen": -132.83120727539062, "logps/rejected": -81.28663635253906, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": 7.155064582824707, "rewards/margins": 2.7755980491638184, "rewards/rejected": 4.379466533660889, "step": 5402 }, { "epoch": 1.2, "learning_rate": 3.673617208353438e-06, "logits/chosen": -2.032424211502075, "logits/rejected": -2.0208663940429688, "logps/chosen": -33.01365280151367, "logps/rejected": -42.476383209228516, "loss": 0.2589, "rewards/accuracies": 1.0, "rewards/chosen": 3.600647449493408, "rewards/margins": 0.503380537033081, "rewards/rejected": 3.097266912460327, "step": 5403 }, { "epoch": 1.2, "learning_rate": 3.6718891804096684e-06, "logits/chosen": -2.079157590866089, "logits/rejected": -2.0810256004333496, "logps/chosen": -127.19293975830078, "logps/rejected": -167.8033447265625, "loss": 0.1071, "rewards/accuracies": 1.0, "rewards/chosen": 7.2650933265686035, "rewards/margins": 1.5113458633422852, "rewards/rejected": 5.753747463226318, "step": 5404 }, { "epoch": 1.2, "learning_rate": 3.6701613231249454e-06, "logits/chosen": -2.1369404792785645, "logits/rejected": -2.0717172622680664, "logps/chosen": -91.94810485839844, "logps/rejected": -86.39299011230469, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 8.864100456237793, "rewards/margins": 2.1391139030456543, "rewards/rejected": 6.724986553192139, "step": 5405 }, { "epoch": 1.2, "learning_rate": 3.668433636721296e-06, "logits/chosen": -1.6207493543624878, "logits/rejected": -1.6350290775299072, "logps/chosen": -46.40858459472656, "logps/rejected": -54.844627380371094, "loss": 0.4908, "rewards/accuracies": 0.0, "rewards/chosen": 3.795778751373291, "rewards/margins": -0.5035476684570312, "rewards/rejected": 4.299326419830322, "step": 5406 }, { "epoch": 1.2, "learning_rate": 3.6667061214207213e-06, "logits/chosen": -1.9547022581100464, "logits/rejected": -1.917405128479004, "logps/chosen": -121.97622680664062, "logps/rejected": -117.77127075195312, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 9.297271728515625, "rewards/margins": 2.094067096710205, "rewards/rejected": 7.20320463180542, "step": 5407 }, { "epoch": 1.2, "learning_rate": 3.6649787774452073e-06, "logits/chosen": -2.169630527496338, "logits/rejected": -2.2165687084198, "logps/chosen": -82.18309783935547, "logps/rejected": -173.9806365966797, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": 10.480939865112305, "rewards/margins": 1.282064437866211, "rewards/rejected": 9.198875427246094, "step": 5408 }, { "epoch": 1.2, "learning_rate": 3.663251605016711e-06, "logits/chosen": -2.1602509021759033, "logits/rejected": -2.1487081050872803, "logps/chosen": -85.88824462890625, "logps/rejected": -89.14710998535156, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": 6.394458293914795, "rewards/margins": 4.193877220153809, "rewards/rejected": 2.2005813121795654, "step": 5409 }, { "epoch": 1.2, "learning_rate": 3.6615246043571674e-06, "logits/chosen": -1.9972871541976929, "logits/rejected": -1.9972871541976929, "logps/chosen": -32.39845275878906, "logps/rejected": -32.39845275878906, "loss": 0.3858, "rewards/accuracies": 0.0, "rewards/chosen": 4.083062648773193, "rewards/margins": 0.0, "rewards/rejected": 4.083062648773193, "step": 5410 }, { "epoch": 1.2, "learning_rate": 3.6597977756884974e-06, "logits/chosen": -1.594283938407898, "logits/rejected": -1.5482814311981201, "logps/chosen": -47.717437744140625, "logps/rejected": -97.32870483398438, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 5.078925609588623, "rewards/margins": 1.7264535427093506, "rewards/rejected": 3.3524720668792725, "step": 5411 }, { "epoch": 1.2, "learning_rate": 3.658071119232589e-06, "logits/chosen": -1.5829720497131348, "logits/rejected": -1.4869979619979858, "logps/chosen": -23.360973358154297, "logps/rejected": -76.73896026611328, "loss": 0.3397, "rewards/accuracies": 1.0, "rewards/chosen": 5.57159948348999, "rewards/margins": 0.09668540954589844, "rewards/rejected": 5.474914073944092, "step": 5412 }, { "epoch": 1.2, "learning_rate": 3.6563446352113176e-06, "logits/chosen": -2.0375545024871826, "logits/rejected": -1.8289228677749634, "logps/chosen": -136.1141815185547, "logps/rejected": -9.52648639678955, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 5.511390686035156, "rewards/margins": 4.433184623718262, "rewards/rejected": 1.078206181526184, "step": 5413 }, { "epoch": 1.2, "learning_rate": 3.65461832384653e-06, "logits/chosen": -1.8059065341949463, "logits/rejected": -1.7278716564178467, "logps/chosen": -31.31137466430664, "logps/rejected": -20.9398193359375, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 3.2629287242889404, "rewards/margins": 2.5882089138031006, "rewards/rejected": 0.6747198104858398, "step": 5414 }, { "epoch": 1.2, "learning_rate": 3.652892185360054e-06, "logits/chosen": -1.9976601600646973, "logits/rejected": -1.8709207773208618, "logps/chosen": -149.8545379638672, "logps/rejected": -31.77948760986328, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 6.519784450531006, "rewards/margins": 6.674830913543701, "rewards/rejected": -0.1550462692975998, "step": 5415 }, { "epoch": 1.2, "learning_rate": 3.6511662199736926e-06, "logits/chosen": -1.9944558143615723, "logits/rejected": -2.042752504348755, "logps/chosen": -28.954139709472656, "logps/rejected": -54.648582458496094, "loss": 0.379, "rewards/accuracies": 1.0, "rewards/chosen": 4.620803356170654, "rewards/margins": 0.11370420455932617, "rewards/rejected": 4.507099151611328, "step": 5416 }, { "epoch": 1.2, "learning_rate": 3.649440427909231e-06, "logits/chosen": -1.9163639545440674, "logits/rejected": -1.9922549724578857, "logps/chosen": -31.55188751220703, "logps/rejected": -83.55130004882812, "loss": 0.4654, "rewards/accuracies": 0.0, "rewards/chosen": 4.435342311859131, "rewards/margins": -0.3541750907897949, "rewards/rejected": 4.789517402648926, "step": 5417 }, { "epoch": 1.2, "learning_rate": 3.6477148093884257e-06, "logits/chosen": -1.8444122076034546, "logits/rejected": -1.7880688905715942, "logps/chosen": -143.80532836914062, "logps/rejected": -82.07588195800781, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 7.871452331542969, "rewards/margins": 1.68499755859375, "rewards/rejected": 6.186454772949219, "step": 5418 }, { "epoch": 1.2, "learning_rate": 3.645989364633019e-06, "logits/chosen": -1.9171308279037476, "logits/rejected": -1.8821970224380493, "logps/chosen": -151.9005126953125, "logps/rejected": -114.48278045654297, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 9.49248218536377, "rewards/margins": 3.399585723876953, "rewards/rejected": 6.092896461486816, "step": 5419 }, { "epoch": 1.2, "learning_rate": 3.6442640938647233e-06, "logits/chosen": -1.7230674028396606, "logits/rejected": -1.6771348714828491, "logps/chosen": -41.08502197265625, "logps/rejected": -38.51668930053711, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": 3.4413719177246094, "rewards/margins": 1.4392414093017578, "rewards/rejected": 2.0021305084228516, "step": 5420 }, { "epoch": 1.2, "learning_rate": 3.642538997305231e-06, "logits/chosen": -2.1501145362854004, "logits/rejected": -2.02986478805542, "logps/chosen": -43.675331115722656, "logps/rejected": -82.95115661621094, "loss": 0.4184, "rewards/accuracies": 0.0, "rewards/chosen": 5.411563396453857, "rewards/margins": -0.18677520751953125, "rewards/rejected": 5.598338603973389, "step": 5421 }, { "epoch": 1.2, "learning_rate": 3.640814075176216e-06, "logits/chosen": -1.9230552911758423, "logits/rejected": -1.9336780309677124, "logps/chosen": -55.07599639892578, "logps/rejected": -53.19449996948242, "loss": 0.9474, "rewards/accuracies": 0.0, "rewards/chosen": 3.820707082748413, "rewards/margins": -1.5906002521514893, "rewards/rejected": 5.411307334899902, "step": 5422 }, { "epoch": 1.2, "learning_rate": 3.6390893276993243e-06, "logits/chosen": -2.1056711673736572, "logits/rejected": -2.0613789558410645, "logps/chosen": -68.38160705566406, "logps/rejected": -66.1140365600586, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 7.1313796043396, "rewards/margins": 2.5902347564697266, "rewards/rejected": 4.541144847869873, "step": 5423 }, { "epoch": 1.2, "learning_rate": 3.6373647550961834e-06, "logits/chosen": -1.8661948442459106, "logits/rejected": -1.7666231393814087, "logps/chosen": -41.09130096435547, "logps/rejected": -4.958275318145752, "loss": 0.3879, "rewards/accuracies": 1.0, "rewards/chosen": 2.1658427715301514, "rewards/margins": 1.0759751796722412, "rewards/rejected": 1.0898675918579102, "step": 5424 }, { "epoch": 1.2, "learning_rate": 3.6356403575883937e-06, "logits/chosen": -1.9086495637893677, "logits/rejected": -1.882659912109375, "logps/chosen": -75.68858337402344, "logps/rejected": -51.083099365234375, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 8.171226501464844, "rewards/margins": 4.178911209106445, "rewards/rejected": 3.9923150539398193, "step": 5425 }, { "epoch": 1.2, "learning_rate": 3.63391613539754e-06, "logits/chosen": -2.1752376556396484, "logits/rejected": -2.170586347579956, "logps/chosen": -31.239402770996094, "logps/rejected": -78.30026245117188, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 3.853015899658203, "rewards/margins": 1.3182075023651123, "rewards/rejected": 2.534808397293091, "step": 5426 }, { "epoch": 1.2, "learning_rate": 3.6321920887451766e-06, "logits/chosen": -2.045565128326416, "logits/rejected": -1.9437930583953857, "logps/chosen": -118.41958618164062, "logps/rejected": -31.04791831970215, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 6.547101020812988, "rewards/margins": 2.9835832118988037, "rewards/rejected": 3.5635178089141846, "step": 5427 }, { "epoch": 1.2, "learning_rate": 3.6304682178528445e-06, "logits/chosen": -1.7443500757217407, "logits/rejected": -1.769056797027588, "logps/chosen": -52.612083435058594, "logps/rejected": -69.48595428466797, "loss": 0.7212, "rewards/accuracies": 0.0, "rewards/chosen": 3.3388054370880127, "rewards/margins": -0.9196703433990479, "rewards/rejected": 4.2584757804870605, "step": 5428 }, { "epoch": 1.2, "learning_rate": 3.6287445229420535e-06, "logits/chosen": -1.8044649362564087, "logits/rejected": -1.6589330434799194, "logps/chosen": -55.54962158203125, "logps/rejected": -15.776198387145996, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 3.916527509689331, "rewards/margins": 3.6233465671539307, "rewards/rejected": 0.2931809425354004, "step": 5429 }, { "epoch": 1.2, "learning_rate": 3.6270210042342937e-06, "logits/chosen": -1.584241271018982, "logits/rejected": -1.584241271018982, "logps/chosen": -36.893585205078125, "logps/rejected": -36.893585205078125, "loss": 0.5034, "rewards/accuracies": 0.0, "rewards/chosen": 1.8469871282577515, "rewards/margins": 0.0, "rewards/rejected": 1.8469871282577515, "step": 5430 }, { "epoch": 1.2, "learning_rate": 3.6252976619510356e-06, "logits/chosen": -1.8564682006835938, "logits/rejected": -1.8167799711227417, "logps/chosen": -55.60138702392578, "logps/rejected": -39.021583557128906, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 4.910394191741943, "rewards/margins": 0.6387944221496582, "rewards/rejected": 4.271599769592285, "step": 5431 }, { "epoch": 1.2, "learning_rate": 3.623574496313722e-06, "logits/chosen": -1.9776419401168823, "logits/rejected": -1.9534869194030762, "logps/chosen": -57.772727966308594, "logps/rejected": -38.602333068847656, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 3.790300130844116, "rewards/margins": 2.036004066467285, "rewards/rejected": 1.7542961835861206, "step": 5432 }, { "epoch": 1.2, "learning_rate": 3.6218515075437786e-06, "logits/chosen": -2.149881601333618, "logits/rejected": -1.749854564666748, "logps/chosen": -67.70823669433594, "logps/rejected": -66.4182357788086, "loss": 0.2175, "rewards/accuracies": 1.0, "rewards/chosen": 3.743786573410034, "rewards/margins": 0.62978196144104, "rewards/rejected": 3.114004611968994, "step": 5433 }, { "epoch": 1.2, "learning_rate": 3.6201286958626013e-06, "logits/chosen": -1.5878000259399414, "logits/rejected": -1.5536139011383057, "logps/chosen": -140.70693969726562, "logps/rejected": -54.72792434692383, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 7.474847316741943, "rewards/margins": 1.433145523071289, "rewards/rejected": 6.041701793670654, "step": 5434 }, { "epoch": 1.2, "learning_rate": 3.618406061491571e-06, "logits/chosen": -1.9058406352996826, "logits/rejected": -1.889918327331543, "logps/chosen": -30.89710807800293, "logps/rejected": -73.09341430664062, "loss": 0.4176, "rewards/accuracies": 0.0, "rewards/chosen": 4.0923004150390625, "rewards/margins": -0.23917245864868164, "rewards/rejected": 4.331472873687744, "step": 5435 }, { "epoch": 1.2, "learning_rate": 3.6166836046520386e-06, "logits/chosen": -1.7892569303512573, "logits/rejected": -1.7507102489471436, "logps/chosen": -86.74623107910156, "logps/rejected": -52.107704162597656, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 9.069595336914062, "rewards/margins": 2.783499240875244, "rewards/rejected": 6.286096096038818, "step": 5436 }, { "epoch": 1.2, "learning_rate": 3.6149613255653406e-06, "logits/chosen": -1.8953816890716553, "logits/rejected": -1.9348005056381226, "logps/chosen": -31.727981567382812, "logps/rejected": -102.07589721679688, "loss": 1.2733, "rewards/accuracies": 0.0, "rewards/chosen": 4.534081935882568, "rewards/margins": -2.4498720169067383, "rewards/rejected": 6.983953952789307, "step": 5437 }, { "epoch": 1.2, "learning_rate": 3.61323922445278e-06, "logits/chosen": -1.8242021799087524, "logits/rejected": -1.430989384651184, "logps/chosen": -29.569324493408203, "logps/rejected": -34.85431671142578, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 4.346832752227783, "rewards/margins": 1.3993115425109863, "rewards/rejected": 2.947521209716797, "step": 5438 }, { "epoch": 1.2, "learning_rate": 3.6115173015356476e-06, "logits/chosen": -2.240910768508911, "logits/rejected": -2.240910768508911, "logps/chosen": -35.12139892578125, "logps/rejected": -35.12139892578125, "loss": 0.4462, "rewards/accuracies": 0.0, "rewards/chosen": 5.398316860198975, "rewards/margins": 0.0, "rewards/rejected": 5.398316860198975, "step": 5439 }, { "epoch": 1.2, "learning_rate": 3.6097955570352027e-06, "logits/chosen": -1.876842975616455, "logits/rejected": -1.876842975616455, "logps/chosen": -28.352685928344727, "logps/rejected": -28.352685928344727, "loss": 0.6787, "rewards/accuracies": 0.0, "rewards/chosen": 2.646237850189209, "rewards/margins": 0.0, "rewards/rejected": 2.646237850189209, "step": 5440 }, { "epoch": 1.2, "learning_rate": 3.608073991172687e-06, "logits/chosen": -2.0161027908325195, "logits/rejected": -1.7336972951889038, "logps/chosen": -82.85325622558594, "logps/rejected": -22.82048797607422, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 7.129690647125244, "rewards/margins": 5.912109851837158, "rewards/rejected": 1.217580795288086, "step": 5441 }, { "epoch": 1.2, "learning_rate": 3.606352604169319e-06, "logits/chosen": -2.1925811767578125, "logits/rejected": -2.1623573303222656, "logps/chosen": -44.50581359863281, "logps/rejected": -110.91098022460938, "loss": 0.2327, "rewards/accuracies": 1.0, "rewards/chosen": 5.453397274017334, "rewards/margins": 0.5357885360717773, "rewards/rejected": 4.917608737945557, "step": 5442 }, { "epoch": 1.2, "learning_rate": 3.6046313962462877e-06, "logits/chosen": -2.0269532203674316, "logits/rejected": -1.8816648721694946, "logps/chosen": -89.2371597290039, "logps/rejected": -243.98019409179688, "loss": 0.3336, "rewards/accuracies": 1.0, "rewards/chosen": 11.07668399810791, "rewards/margins": 0.07636356353759766, "rewards/rejected": 11.000320434570312, "step": 5443 }, { "epoch": 1.2, "learning_rate": 3.60291036762477e-06, "logits/chosen": -1.4794822931289673, "logits/rejected": -1.5004968643188477, "logps/chosen": -79.40861511230469, "logps/rejected": -97.63203430175781, "loss": 0.5797, "rewards/accuracies": 0.0, "rewards/chosen": 8.872654914855957, "rewards/margins": -0.68524169921875, "rewards/rejected": 9.557896614074707, "step": 5444 }, { "epoch": 1.21, "learning_rate": 3.6011895185259103e-06, "logits/chosen": -1.8881911039352417, "logits/rejected": -1.9122666120529175, "logps/chosen": -25.186214447021484, "logps/rejected": -45.722660064697266, "loss": 2.0744, "rewards/accuracies": 0.0, "rewards/chosen": 2.9155189990997314, "rewards/margins": -2.0817434787750244, "rewards/rejected": 4.997262477874756, "step": 5445 }, { "epoch": 1.21, "learning_rate": 3.5994688491708364e-06, "logits/chosen": -2.071892023086548, "logits/rejected": -2.0481691360473633, "logps/chosen": -91.83296203613281, "logps/rejected": -72.865234375, "loss": 0.1912, "rewards/accuracies": 1.0, "rewards/chosen": 6.06448221206665, "rewards/margins": 0.9084534645080566, "rewards/rejected": 5.156028747558594, "step": 5446 }, { "epoch": 1.21, "learning_rate": 3.5977483597806472e-06, "logits/chosen": -1.4867628812789917, "logits/rejected": -1.4143179655075073, "logps/chosen": -69.15766906738281, "logps/rejected": -44.45100784301758, "loss": 0.1155, "rewards/accuracies": 1.0, "rewards/chosen": 3.7758805751800537, "rewards/margins": 1.4102742671966553, "rewards/rejected": 2.3656063079833984, "step": 5447 }, { "epoch": 1.21, "learning_rate": 3.5960280505764244e-06, "logits/chosen": -1.7511065006256104, "logits/rejected": -1.699974775314331, "logps/chosen": -71.70529174804688, "logps/rejected": -36.70710754394531, "loss": 0.173, "rewards/accuracies": 1.0, "rewards/chosen": 5.22335147857666, "rewards/margins": 1.780921220779419, "rewards/rejected": 3.442430257797241, "step": 5448 }, { "epoch": 1.21, "learning_rate": 3.594307921779221e-06, "logits/chosen": -1.8345187902450562, "logits/rejected": -1.8345187902450562, "logps/chosen": -22.0775146484375, "logps/rejected": -22.0775146484375, "loss": 0.3859, "rewards/accuracies": 0.0, "rewards/chosen": 4.114329814910889, "rewards/margins": 0.0, "rewards/rejected": 4.114329814910889, "step": 5449 }, { "epoch": 1.21, "learning_rate": 3.59258797361007e-06, "logits/chosen": -2.074680805206299, "logits/rejected": -1.9618619680404663, "logps/chosen": -118.96464538574219, "logps/rejected": -45.774253845214844, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 8.444007873535156, "rewards/margins": 6.1204352378845215, "rewards/rejected": 2.3235726356506348, "step": 5450 }, { "epoch": 1.21, "learning_rate": 3.5908682062899824e-06, "logits/chosen": -1.8657968044281006, "logits/rejected": -1.8156780004501343, "logps/chosen": -31.06364631652832, "logps/rejected": -86.54483032226562, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": 3.55839467048645, "rewards/margins": 0.8973076343536377, "rewards/rejected": 2.6610870361328125, "step": 5451 }, { "epoch": 1.21, "learning_rate": 3.5891486200399413e-06, "logits/chosen": -1.7924596071243286, "logits/rejected": -1.7647894620895386, "logps/chosen": -59.97650909423828, "logps/rejected": -59.67897033691406, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 4.603118419647217, "rewards/margins": 1.6483240127563477, "rewards/rejected": 2.954794406890869, "step": 5452 }, { "epoch": 1.21, "learning_rate": 3.587429215080912e-06, "logits/chosen": -2.123701572418213, "logits/rejected": -2.1148459911346436, "logps/chosen": -48.419944763183594, "logps/rejected": -99.15908813476562, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 5.479917049407959, "rewards/margins": 2.8424384593963623, "rewards/rejected": 2.6374785900115967, "step": 5453 }, { "epoch": 1.21, "learning_rate": 3.585709991633831e-06, "logits/chosen": -1.9647469520568848, "logits/rejected": -2.009622812271118, "logps/chosen": -48.000614166259766, "logps/rejected": -91.75912475585938, "loss": 0.3202, "rewards/accuracies": 1.0, "rewards/chosen": 4.30436897277832, "rewards/margins": 0.10889148712158203, "rewards/rejected": 4.195477485656738, "step": 5454 }, { "epoch": 1.21, "learning_rate": 3.583990949919619e-06, "logits/chosen": -1.8017253875732422, "logits/rejected": -1.6446939706802368, "logps/chosen": -106.5777587890625, "logps/rejected": -38.74761962890625, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 5.7064666748046875, "rewards/margins": 3.16522216796875, "rewards/rejected": 2.5412445068359375, "step": 5455 }, { "epoch": 1.21, "learning_rate": 3.5822720901591644e-06, "logits/chosen": -2.044829845428467, "logits/rejected": -2.0336251258850098, "logps/chosen": -116.17308044433594, "logps/rejected": -57.345035552978516, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 7.534999370574951, "rewards/margins": 2.8173770904541016, "rewards/rejected": 4.71762228012085, "step": 5456 }, { "epoch": 1.21, "learning_rate": 3.5805534125733393e-06, "logits/chosen": -2.0230259895324707, "logits/rejected": -2.0350213050842285, "logps/chosen": -56.890052795410156, "logps/rejected": -70.78409576416016, "loss": 0.2695, "rewards/accuracies": 1.0, "rewards/chosen": 3.8935303688049316, "rewards/margins": 0.33876872062683105, "rewards/rejected": 3.5547616481781006, "step": 5457 }, { "epoch": 1.21, "learning_rate": 3.5788349173829862e-06, "logits/chosen": -1.8193529844284058, "logits/rejected": -1.850473403930664, "logps/chosen": -69.42254638671875, "logps/rejected": -51.86105728149414, "loss": 0.867, "rewards/accuracies": 0.0, "rewards/chosen": 3.342514753341675, "rewards/margins": -1.1855151653289795, "rewards/rejected": 4.528029918670654, "step": 5458 }, { "epoch": 1.21, "learning_rate": 3.577116604808932e-06, "logits/chosen": -1.7425906658172607, "logits/rejected": -1.7425906658172607, "logps/chosen": -64.14485168457031, "logps/rejected": -64.14485168457031, "loss": 0.4681, "rewards/accuracies": 0.0, "rewards/chosen": 2.8067047595977783, "rewards/margins": 0.0, "rewards/rejected": 2.8067047595977783, "step": 5459 }, { "epoch": 1.21, "learning_rate": 3.5753984750719734e-06, "logits/chosen": -2.1220083236694336, "logits/rejected": -2.108665704727173, "logps/chosen": -84.24315643310547, "logps/rejected": -90.06547546386719, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 8.760059356689453, "rewards/margins": 3.6757664680480957, "rewards/rejected": 5.084292888641357, "step": 5460 }, { "epoch": 1.21, "learning_rate": 3.5736805283928842e-06, "logits/chosen": -1.9433242082595825, "logits/rejected": -1.7105534076690674, "logps/chosen": -72.4686279296875, "logps/rejected": -63.13499450683594, "loss": 0.2198, "rewards/accuracies": 1.0, "rewards/chosen": 5.272909641265869, "rewards/margins": 1.154158115386963, "rewards/rejected": 4.118751525878906, "step": 5461 }, { "epoch": 1.21, "learning_rate": 3.5719627649924205e-06, "logits/chosen": -1.776708960533142, "logits/rejected": -1.7592171430587769, "logps/chosen": -170.5503387451172, "logps/rejected": -108.15210723876953, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 8.388542175292969, "rewards/margins": 3.300194263458252, "rewards/rejected": 5.088347911834717, "step": 5462 }, { "epoch": 1.21, "learning_rate": 3.570245185091306e-06, "logits/chosen": -1.9107460975646973, "logits/rejected": -1.8373950719833374, "logps/chosen": -35.464134216308594, "logps/rejected": -48.86893844604492, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 2.7258613109588623, "rewards/margins": 0.5845179557800293, "rewards/rejected": 2.141343355178833, "step": 5463 }, { "epoch": 1.21, "learning_rate": 3.568527788910251e-06, "logits/chosen": -1.7440131902694702, "logits/rejected": -1.6726412773132324, "logps/chosen": -54.398468017578125, "logps/rejected": -62.360130310058594, "loss": 0.1806, "rewards/accuracies": 1.0, "rewards/chosen": 2.1652801036834717, "rewards/margins": 2.3103137016296387, "rewards/rejected": -0.14503364264965057, "step": 5464 }, { "epoch": 1.21, "learning_rate": 3.5668105766699325e-06, "logits/chosen": -2.163255214691162, "logits/rejected": -2.1578381061553955, "logps/chosen": -51.875450134277344, "logps/rejected": -42.37046813964844, "loss": 0.733, "rewards/accuracies": 0.0, "rewards/chosen": 4.3929643630981445, "rewards/margins": -1.1975302696228027, "rewards/rejected": 5.590494632720947, "step": 5465 }, { "epoch": 1.21, "learning_rate": 3.5650935485910103e-06, "logits/chosen": -1.7617918252944946, "logits/rejected": -1.7171556949615479, "logps/chosen": -72.48703002929688, "logps/rejected": -74.91078186035156, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 5.943783760070801, "rewards/margins": 2.4739465713500977, "rewards/rejected": 3.469837188720703, "step": 5466 }, { "epoch": 1.21, "learning_rate": 3.563376704894116e-06, "logits/chosen": -1.7013516426086426, "logits/rejected": -1.7013516426086426, "logps/chosen": -54.83195877075195, "logps/rejected": -54.83195877075195, "loss": 0.4125, "rewards/accuracies": 0.0, "rewards/chosen": 7.8266496658325195, "rewards/margins": 0.0, "rewards/rejected": 7.8266496658325195, "step": 5467 }, { "epoch": 1.21, "learning_rate": 3.5616600457998633e-06, "logits/chosen": -2.1395022869110107, "logits/rejected": -2.1773312091827393, "logps/chosen": -64.1693344116211, "logps/rejected": -62.67689895629883, "loss": 0.2102, "rewards/accuracies": 1.0, "rewards/chosen": 4.302254676818848, "rewards/margins": 0.8456737995147705, "rewards/rejected": 3.456580877304077, "step": 5468 }, { "epoch": 1.21, "learning_rate": 3.5599435715288355e-06, "logits/chosen": -2.12764048576355, "logits/rejected": -1.9676764011383057, "logps/chosen": -111.59519958496094, "logps/rejected": -35.56782150268555, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 7.828593730926514, "rewards/margins": 5.704831123352051, "rewards/rejected": 2.123762845993042, "step": 5469 }, { "epoch": 1.21, "learning_rate": 3.558227282301599e-06, "logits/chosen": -1.867772102355957, "logits/rejected": -1.8190131187438965, "logps/chosen": -42.76435089111328, "logps/rejected": -42.70098114013672, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": 4.425118923187256, "rewards/margins": 0.25794124603271484, "rewards/rejected": 4.167177677154541, "step": 5470 }, { "epoch": 1.21, "learning_rate": 3.5565111783386906e-06, "logits/chosen": -1.853864312171936, "logits/rejected": -1.8061305284500122, "logps/chosen": -68.79161071777344, "logps/rejected": -51.46900177001953, "loss": 0.3603, "rewards/accuracies": 1.0, "rewards/chosen": 1.9837814569473267, "rewards/margins": 0.07876968383789062, "rewards/rejected": 1.905011773109436, "step": 5471 }, { "epoch": 1.21, "learning_rate": 3.5547952598606245e-06, "logits/chosen": -1.9919360876083374, "logits/rejected": -1.9439303874969482, "logps/chosen": -39.0018310546875, "logps/rejected": -71.2752456665039, "loss": 0.2298, "rewards/accuracies": 1.0, "rewards/chosen": 5.998594760894775, "rewards/margins": 0.7545433044433594, "rewards/rejected": 5.244051456451416, "step": 5472 }, { "epoch": 1.21, "learning_rate": 3.5530795270878935e-06, "logits/chosen": -1.5962709188461304, "logits/rejected": -1.853024959564209, "logps/chosen": -10.516681671142578, "logps/rejected": -63.797607421875, "loss": 0.9044, "rewards/accuracies": 0.0, "rewards/chosen": 1.1673921346664429, "rewards/margins": -1.4307218790054321, "rewards/rejected": 2.598114013671875, "step": 5473 }, { "epoch": 1.21, "learning_rate": 3.551363980240965e-06, "logits/chosen": -2.2571732997894287, "logits/rejected": -2.2521610260009766, "logps/chosen": -67.50578308105469, "logps/rejected": -10.503169059753418, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 4.533786296844482, "rewards/margins": 3.7131199836730957, "rewards/rejected": 0.8206664323806763, "step": 5474 }, { "epoch": 1.21, "learning_rate": 3.5496486195402835e-06, "logits/chosen": -2.2060470581054688, "logits/rejected": -2.195955514907837, "logps/chosen": -49.69291687011719, "logps/rejected": -69.2808837890625, "loss": 0.1785, "rewards/accuracies": 1.0, "rewards/chosen": 5.072044372558594, "rewards/margins": 1.6678962707519531, "rewards/rejected": 3.4041481018066406, "step": 5475 }, { "epoch": 1.21, "learning_rate": 3.5479334452062665e-06, "logits/chosen": -1.7494769096374512, "logits/rejected": -1.8310589790344238, "logps/chosen": -56.412559509277344, "logps/rejected": -103.41075134277344, "loss": 0.1785, "rewards/accuracies": 1.0, "rewards/chosen": 7.839417457580566, "rewards/margins": 0.8470511436462402, "rewards/rejected": 6.992366313934326, "step": 5476 }, { "epoch": 1.21, "learning_rate": 3.546218457459312e-06, "logits/chosen": -1.694923996925354, "logits/rejected": -1.6778466701507568, "logps/chosen": -33.653079986572266, "logps/rejected": -45.11186981201172, "loss": 0.2035, "rewards/accuracies": 1.0, "rewards/chosen": 4.570413112640381, "rewards/margins": 2.0323750972747803, "rewards/rejected": 2.5380380153656006, "step": 5477 }, { "epoch": 1.21, "learning_rate": 3.54450365651979e-06, "logits/chosen": -2.0022788047790527, "logits/rejected": -2.0022788047790527, "logps/chosen": -3.4684152603149414, "logps/rejected": -3.4684152603149414, "loss": 2.6747, "rewards/accuracies": 0.0, "rewards/chosen": 0.7196750044822693, "rewards/margins": 0.0, "rewards/rejected": 0.7196750044822693, "step": 5478 }, { "epoch": 1.21, "learning_rate": 3.5427890426080503e-06, "logits/chosen": -2.1888587474823, "logits/rejected": -2.1796560287475586, "logps/chosen": -62.66249465942383, "logps/rejected": -76.58540344238281, "loss": 0.2206, "rewards/accuracies": 1.0, "rewards/chosen": 5.372941970825195, "rewards/margins": 0.6098332405090332, "rewards/rejected": 4.763108730316162, "step": 5479 }, { "epoch": 1.21, "learning_rate": 3.5410746159444165e-06, "logits/chosen": -1.975598692893982, "logits/rejected": -1.9960949420928955, "logps/chosen": -29.055706024169922, "logps/rejected": -47.253761291503906, "loss": 0.257, "rewards/accuracies": 1.0, "rewards/chosen": 2.8729088306427, "rewards/margins": 0.547518253326416, "rewards/rejected": 2.325390577316284, "step": 5480 }, { "epoch": 1.21, "learning_rate": 3.5393603767491856e-06, "logits/chosen": -1.9020811319351196, "logits/rejected": -1.9276748895645142, "logps/chosen": -63.85564422607422, "logps/rejected": -91.14443969726562, "loss": 0.711, "rewards/accuracies": 0.0, "rewards/chosen": 8.923712730407715, "rewards/margins": -1.1423454284667969, "rewards/rejected": 10.066058158874512, "step": 5481 }, { "epoch": 1.21, "learning_rate": 3.537646325242635e-06, "logits/chosen": -1.6660728454589844, "logits/rejected": -1.7694764137268066, "logps/chosen": -24.99811553955078, "logps/rejected": -107.71672058105469, "loss": 1.5361, "rewards/accuracies": 0.0, "rewards/chosen": 3.743572235107422, "rewards/margins": -3.0119376182556152, "rewards/rejected": 6.755509853363037, "step": 5482 }, { "epoch": 1.21, "learning_rate": 3.535932461645017e-06, "logits/chosen": -2.219876766204834, "logits/rejected": -2.1532742977142334, "logps/chosen": -89.61991882324219, "logps/rejected": -89.47439575195312, "loss": 0.1557, "rewards/accuracies": 1.0, "rewards/chosen": 7.3187150955200195, "rewards/margins": 1.0156054496765137, "rewards/rejected": 6.303109645843506, "step": 5483 }, { "epoch": 1.21, "learning_rate": 3.5342187861765587e-06, "logits/chosen": -1.8756959438323975, "logits/rejected": -1.8756959438323975, "logps/chosen": -34.87727355957031, "logps/rejected": -34.87727355957031, "loss": 0.5559, "rewards/accuracies": 0.0, "rewards/chosen": 2.518073320388794, "rewards/margins": 0.0, "rewards/rejected": 2.518073320388794, "step": 5484 }, { "epoch": 1.21, "learning_rate": 3.5325052990574605e-06, "logits/chosen": -2.0819408893585205, "logits/rejected": -2.0612952709198, "logps/chosen": -70.50265502929688, "logps/rejected": -63.00276565551758, "loss": 0.1366, "rewards/accuracies": 1.0, "rewards/chosen": 4.364841461181641, "rewards/margins": 1.1782207489013672, "rewards/rejected": 3.1866207122802734, "step": 5485 }, { "epoch": 1.21, "learning_rate": 3.530792000507906e-06, "logits/chosen": -1.6570926904678345, "logits/rejected": -1.5304638147354126, "logps/chosen": -58.35028839111328, "logps/rejected": -8.397329330444336, "loss": 0.3929, "rewards/accuracies": 1.0, "rewards/chosen": 3.45780873298645, "rewards/margins": 2.331522226333618, "rewards/rejected": 1.126286506652832, "step": 5486 }, { "epoch": 1.21, "learning_rate": 3.5290788907480454e-06, "logits/chosen": -2.126952648162842, "logits/rejected": -2.170008420944214, "logps/chosen": -110.49873352050781, "logps/rejected": -69.27301025390625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 10.200179100036621, "rewards/margins": 7.1050801277160645, "rewards/rejected": 3.0950989723205566, "step": 5487 }, { "epoch": 1.21, "learning_rate": 3.527365969998013e-06, "logits/chosen": -1.7257477045059204, "logits/rejected": -1.7512117624282837, "logps/chosen": -53.98381805419922, "logps/rejected": -76.29682922363281, "loss": 1.2372, "rewards/accuracies": 0.0, "rewards/chosen": 4.993003845214844, "rewards/margins": -1.0750703811645508, "rewards/rejected": 6.0680742263793945, "step": 5488 }, { "epoch": 1.21, "learning_rate": 3.525653238477913e-06, "logits/chosen": -2.028740882873535, "logits/rejected": -1.9846667051315308, "logps/chosen": -53.693729400634766, "logps/rejected": -46.81489944458008, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 4.027650356292725, "rewards/margins": 0.6642472743988037, "rewards/rejected": 3.363403081893921, "step": 5489 }, { "epoch": 1.22, "learning_rate": 3.5239406964078282e-06, "logits/chosen": -1.9683946371078491, "logits/rejected": -1.9876973628997803, "logps/chosen": -42.88938903808594, "logps/rejected": -93.04490661621094, "loss": 0.4637, "rewards/accuracies": 1.0, "rewards/chosen": 6.6836838722229, "rewards/margins": 0.6140198707580566, "rewards/rejected": 6.069664001464844, "step": 5490 }, { "epoch": 1.22, "learning_rate": 3.522228344007816e-06, "logits/chosen": -1.7127764225006104, "logits/rejected": -1.707431674003601, "logps/chosen": -33.321067810058594, "logps/rejected": -49.408164978027344, "loss": 1.194, "rewards/accuracies": 0.0, "rewards/chosen": 2.9255502223968506, "rewards/margins": -0.6210761070251465, "rewards/rejected": 3.546626329421997, "step": 5491 }, { "epoch": 1.22, "learning_rate": 3.5205161814979064e-06, "logits/chosen": -2.1374869346618652, "logits/rejected": -2.1107656955718994, "logps/chosen": -80.927001953125, "logps/rejected": -59.68414306640625, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": 7.521960735321045, "rewards/margins": 1.2105212211608887, "rewards/rejected": 6.311439514160156, "step": 5492 }, { "epoch": 1.22, "learning_rate": 3.5188042090981135e-06, "logits/chosen": -1.923324704170227, "logits/rejected": -1.9102624654769897, "logps/chosen": -133.82516479492188, "logps/rejected": -82.12020111083984, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": 9.071988105773926, "rewards/margins": 2.3513312339782715, "rewards/rejected": 6.720656871795654, "step": 5493 }, { "epoch": 1.22, "learning_rate": 3.5170924270284166e-06, "logits/chosen": -2.323366641998291, "logits/rejected": -2.3132760524749756, "logps/chosen": -29.99463653564453, "logps/rejected": -65.93862915039062, "loss": 0.4097, "rewards/accuracies": 0.0, "rewards/chosen": 3.2106568813323975, "rewards/margins": -0.13029098510742188, "rewards/rejected": 3.3409478664398193, "step": 5494 }, { "epoch": 1.22, "learning_rate": 3.5153808355087804e-06, "logits/chosen": -2.076503038406372, "logits/rejected": -2.072535276412964, "logps/chosen": -32.71409225463867, "logps/rejected": -49.39683151245117, "loss": 0.5219, "rewards/accuracies": 0.0, "rewards/chosen": 3.2754878997802734, "rewards/margins": -0.3110787868499756, "rewards/rejected": 3.586566686630249, "step": 5495 }, { "epoch": 1.22, "learning_rate": 3.513669434759136e-06, "logits/chosen": -1.61619234085083, "logits/rejected": -1.5792111158370972, "logps/chosen": -31.134811401367188, "logps/rejected": -41.26213836669922, "loss": 0.1707, "rewards/accuracies": 1.0, "rewards/chosen": 2.6830430030822754, "rewards/margins": 0.9002114534378052, "rewards/rejected": 1.7828315496444702, "step": 5496 }, { "epoch": 1.22, "learning_rate": 3.5119582249993977e-06, "logits/chosen": -1.8242157697677612, "logits/rejected": -1.6843702793121338, "logps/chosen": -75.95808410644531, "logps/rejected": -52.469268798828125, "loss": 0.4513, "rewards/accuracies": 1.0, "rewards/chosen": 7.068376064300537, "rewards/margins": 6.669006824493408, "rewards/rejected": 0.3993694484233856, "step": 5497 }, { "epoch": 1.22, "learning_rate": 3.51024720644945e-06, "logits/chosen": -2.105835437774658, "logits/rejected": -2.121976375579834, "logps/chosen": -49.69194793701172, "logps/rejected": -75.36125946044922, "loss": 0.5239, "rewards/accuracies": 0.0, "rewards/chosen": 4.323062419891357, "rewards/margins": -0.458096981048584, "rewards/rejected": 4.781159400939941, "step": 5498 }, { "epoch": 1.22, "learning_rate": 3.5085363793291548e-06, "logits/chosen": -1.7975009679794312, "logits/rejected": -1.8838348388671875, "logps/chosen": -38.605751037597656, "logps/rejected": -121.955078125, "loss": 1.8425, "rewards/accuracies": 0.0, "rewards/chosen": 5.41772985458374, "rewards/margins": -3.073697566986084, "rewards/rejected": 8.491427421569824, "step": 5499 }, { "epoch": 1.22, "learning_rate": 3.506825743858351e-06, "logits/chosen": -2.0134100914001465, "logits/rejected": -1.8866957426071167, "logps/chosen": -68.98739624023438, "logps/rejected": -23.396669387817383, "loss": 0.1039, "rewards/accuracies": 1.0, "rewards/chosen": 3.6046454906463623, "rewards/margins": 2.7307064533233643, "rewards/rejected": 0.8739389777183533, "step": 5500 }, { "epoch": 1.22, "learning_rate": 3.505115300256846e-06, "logits/chosen": -1.9172836542129517, "logits/rejected": -1.872397780418396, "logps/chosen": -50.624542236328125, "logps/rejected": -54.51747512817383, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 4.2234649658203125, "rewards/margins": 1.2478220462799072, "rewards/rejected": 2.9756429195404053, "step": 5501 }, { "epoch": 1.22, "learning_rate": 3.5034050487444347e-06, "logits/chosen": -1.8546545505523682, "logits/rejected": -1.7167589664459229, "logps/chosen": -53.289710998535156, "logps/rejected": -83.32730865478516, "loss": 2.0444, "rewards/accuracies": 0.0, "rewards/chosen": 4.662613868713379, "rewards/margins": -3.974278450012207, "rewards/rejected": 8.636892318725586, "step": 5502 }, { "epoch": 1.22, "learning_rate": 3.501694989540875e-06, "logits/chosen": -1.9642884731292725, "logits/rejected": -1.9473885297775269, "logps/chosen": -27.69886016845703, "logps/rejected": -43.487728118896484, "loss": 0.568, "rewards/accuracies": 0.0, "rewards/chosen": 2.558495283126831, "rewards/margins": -0.6345729827880859, "rewards/rejected": 3.193068265914917, "step": 5503 }, { "epoch": 1.22, "learning_rate": 3.499985122865908e-06, "logits/chosen": -1.9536635875701904, "logits/rejected": -1.954361081123352, "logps/chosen": -38.6545524597168, "logps/rejected": -54.73414611816406, "loss": 0.1601, "rewards/accuracies": 1.0, "rewards/chosen": 5.404414176940918, "rewards/margins": 1.135127067565918, "rewards/rejected": 4.269287109375, "step": 5504 }, { "epoch": 1.22, "learning_rate": 3.4982754489392455e-06, "logits/chosen": -1.998210072517395, "logits/rejected": -1.947545051574707, "logps/chosen": -40.21356201171875, "logps/rejected": -51.045555114746094, "loss": 0.78, "rewards/accuracies": 1.0, "rewards/chosen": 3.2876336574554443, "rewards/margins": 1.311246633529663, "rewards/rejected": 1.9763870239257812, "step": 5505 }, { "epoch": 1.22, "learning_rate": 3.4965659679805796e-06, "logits/chosen": -1.944157600402832, "logits/rejected": -1.836635708808899, "logps/chosen": -46.44978713989258, "logps/rejected": -34.96266174316406, "loss": 0.1932, "rewards/accuracies": 1.0, "rewards/chosen": 2.4478237628936768, "rewards/margins": 0.8151344060897827, "rewards/rejected": 1.632689356803894, "step": 5506 }, { "epoch": 1.22, "learning_rate": 3.494856680209572e-06, "logits/chosen": -1.754146695137024, "logits/rejected": -1.6470667123794556, "logps/chosen": -118.2037353515625, "logps/rejected": -84.60525512695312, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": 5.84712553024292, "rewards/margins": 4.738807678222656, "rewards/rejected": 1.1083176136016846, "step": 5507 }, { "epoch": 1.22, "learning_rate": 3.4931475858458634e-06, "logits/chosen": -1.9630613327026367, "logits/rejected": -1.933875322341919, "logps/chosen": -61.463218688964844, "logps/rejected": -41.034454345703125, "loss": 0.3403, "rewards/accuracies": 1.0, "rewards/chosen": 2.9421164989471436, "rewards/margins": 0.23254990577697754, "rewards/rejected": 2.709566593170166, "step": 5508 }, { "epoch": 1.22, "learning_rate": 3.491438685109066e-06, "logits/chosen": -1.7157630920410156, "logits/rejected": -1.4859647750854492, "logps/chosen": -60.97652053833008, "logps/rejected": -45.990116119384766, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 5.786210536956787, "rewards/margins": 5.012793064117432, "rewards/rejected": 0.7734176516532898, "step": 5509 }, { "epoch": 1.22, "learning_rate": 3.4897299782187733e-06, "logits/chosen": -2.0640709400177, "logits/rejected": -2.0551369190216064, "logps/chosen": -35.888145446777344, "logps/rejected": -33.36936950683594, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": 4.671242713928223, "rewards/margins": 2.192502021789551, "rewards/rejected": 2.478740692138672, "step": 5510 }, { "epoch": 1.22, "learning_rate": 3.4880214653945472e-06, "logits/chosen": -1.755455732345581, "logits/rejected": -1.7450658082962036, "logps/chosen": -25.51373291015625, "logps/rejected": -30.13458824157715, "loss": 0.4429, "rewards/accuracies": 1.0, "rewards/chosen": 3.7954232692718506, "rewards/margins": 1.2608611583709717, "rewards/rejected": 2.534562110900879, "step": 5511 }, { "epoch": 1.22, "learning_rate": 3.4863131468559264e-06, "logits/chosen": -2.1174633502960205, "logits/rejected": -2.100302219390869, "logps/chosen": -77.31314849853516, "logps/rejected": -76.68397521972656, "loss": 0.1703, "rewards/accuracies": 1.0, "rewards/chosen": 8.272126197814941, "rewards/margins": 1.5949854850769043, "rewards/rejected": 6.677140712738037, "step": 5512 }, { "epoch": 1.22, "learning_rate": 3.4846050228224295e-06, "logits/chosen": -2.1404173374176025, "logits/rejected": -2.0660572052001953, "logps/chosen": -124.47676849365234, "logps/rejected": -102.11105346679688, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 9.661494255065918, "rewards/margins": 4.373214244842529, "rewards/rejected": 5.288280010223389, "step": 5513 }, { "epoch": 1.22, "learning_rate": 3.4828970935135426e-06, "logits/chosen": -1.7462960481643677, "logits/rejected": -1.8158336877822876, "logps/chosen": -35.62068557739258, "logps/rejected": -55.83199691772461, "loss": 0.6896, "rewards/accuracies": 0.0, "rewards/chosen": 3.4033780097961426, "rewards/margins": -1.074864387512207, "rewards/rejected": 4.47824239730835, "step": 5514 }, { "epoch": 1.22, "learning_rate": 3.481189359148733e-06, "logits/chosen": -1.903355598449707, "logits/rejected": -1.8971068859100342, "logps/chosen": -65.06000518798828, "logps/rejected": -45.83982849121094, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": 4.8102850914001465, "rewards/margins": 0.34842586517333984, "rewards/rejected": 4.461859226226807, "step": 5515 }, { "epoch": 1.22, "learning_rate": 3.479481819947439e-06, "logits/chosen": -1.7973527908325195, "logits/rejected": -1.7521189451217651, "logps/chosen": -52.65447998046875, "logps/rejected": -88.22494506835938, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 4.8742570877075195, "rewards/margins": 1.624215841293335, "rewards/rejected": 3.2500412464141846, "step": 5516 }, { "epoch": 1.22, "learning_rate": 3.477774476129077e-06, "logits/chosen": -1.9529826641082764, "logits/rejected": -1.9193096160888672, "logps/chosen": -49.78659439086914, "logps/rejected": -38.666900634765625, "loss": 0.1907, "rewards/accuracies": 1.0, "rewards/chosen": 5.183993339538574, "rewards/margins": 1.1832304000854492, "rewards/rejected": 4.000762939453125, "step": 5517 }, { "epoch": 1.22, "learning_rate": 3.476067327913034e-06, "logits/chosen": -1.7973073720932007, "logits/rejected": -1.6404608488082886, "logps/chosen": -32.52716827392578, "logps/rejected": -70.00155639648438, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": 2.249340057373047, "rewards/margins": 1.565314531326294, "rewards/rejected": 0.6840255856513977, "step": 5518 }, { "epoch": 1.22, "learning_rate": 3.4743603755186773e-06, "logits/chosen": -2.0300869941711426, "logits/rejected": -1.747134804725647, "logps/chosen": -122.29008483886719, "logps/rejected": -103.22713470458984, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 5.25881814956665, "rewards/margins": 2.9937210083007812, "rewards/rejected": 2.265097141265869, "step": 5519 }, { "epoch": 1.22, "learning_rate": 3.472653619165343e-06, "logits/chosen": -1.7506670951843262, "logits/rejected": -1.772027611732483, "logps/chosen": -45.323055267333984, "logps/rejected": -51.110477447509766, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": 3.8924381732940674, "rewards/margins": 0.6391196250915527, "rewards/rejected": 3.2533185482025146, "step": 5520 }, { "epoch": 1.22, "learning_rate": 3.470947059072349e-06, "logits/chosen": -1.9315564632415771, "logits/rejected": -1.884490728378296, "logps/chosen": -40.53804016113281, "logps/rejected": -26.97076416015625, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 2.4822030067443848, "rewards/margins": 1.6092156171798706, "rewards/rejected": 0.8729873895645142, "step": 5521 }, { "epoch": 1.22, "learning_rate": 3.469240695458983e-06, "logits/chosen": -2.245004415512085, "logits/rejected": -2.2740602493286133, "logps/chosen": -86.70787048339844, "logps/rejected": -205.74996948242188, "loss": 0.1597, "rewards/accuracies": 1.0, "rewards/chosen": 12.305380821228027, "rewards/margins": 1.3208837509155273, "rewards/rejected": 10.9844970703125, "step": 5522 }, { "epoch": 1.22, "learning_rate": 3.467534528544506e-06, "logits/chosen": -2.1056277751922607, "logits/rejected": -2.032006025314331, "logps/chosen": -52.70535659790039, "logps/rejected": -30.167724609375, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 3.182652711868286, "rewards/margins": 2.355800151824951, "rewards/rejected": 0.8268524408340454, "step": 5523 }, { "epoch": 1.22, "learning_rate": 3.4658285585481586e-06, "logits/chosen": -1.9715197086334229, "logits/rejected": -1.9527016878128052, "logps/chosen": -48.461097717285156, "logps/rejected": -90.43505096435547, "loss": 0.4363, "rewards/accuracies": 0.0, "rewards/chosen": 3.712904453277588, "rewards/margins": -0.32108545303344727, "rewards/rejected": 4.033989906311035, "step": 5524 }, { "epoch": 1.22, "learning_rate": 3.4641227856891523e-06, "logits/chosen": -1.5421240329742432, "logits/rejected": -1.5246816873550415, "logps/chosen": -30.844844818115234, "logps/rejected": -64.51366424560547, "loss": 0.138, "rewards/accuracies": 1.0, "rewards/chosen": 4.6023688316345215, "rewards/margins": 2.022892713546753, "rewards/rejected": 2.5794761180877686, "step": 5525 }, { "epoch": 1.22, "learning_rate": 3.4624172101866776e-06, "logits/chosen": -1.8554353713989258, "logits/rejected": -1.8491419553756714, "logps/chosen": -114.01272583007812, "logps/rejected": -122.291015625, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 8.454520225524902, "rewards/margins": 1.895958423614502, "rewards/rejected": 6.5585618019104, "step": 5526 }, { "epoch": 1.22, "learning_rate": 3.4607118322598933e-06, "logits/chosen": -2.148175001144409, "logits/rejected": -2.1123650074005127, "logps/chosen": -73.06672668457031, "logps/rejected": -39.47597122192383, "loss": 0.2541, "rewards/accuracies": 1.0, "rewards/chosen": 4.847500801086426, "rewards/margins": 1.5127630233764648, "rewards/rejected": 3.334737777709961, "step": 5527 }, { "epoch": 1.22, "learning_rate": 3.45900665212794e-06, "logits/chosen": -1.9971662759780884, "logits/rejected": -1.98030686378479, "logps/chosen": -35.214176177978516, "logps/rejected": -57.688148498535156, "loss": 0.5026, "rewards/accuracies": 0.0, "rewards/chosen": 4.172723293304443, "rewards/margins": -0.3553619384765625, "rewards/rejected": 4.528085231781006, "step": 5528 }, { "epoch": 1.22, "learning_rate": 3.4573016700099254e-06, "logits/chosen": -1.9866429567337036, "logits/rejected": -1.950896978378296, "logps/chosen": -78.33171081542969, "logps/rejected": -60.947998046875, "loss": 0.0504, "rewards/accuracies": 1.0, "rewards/chosen": 5.442051887512207, "rewards/margins": 2.863787889480591, "rewards/rejected": 2.578263998031616, "step": 5529 }, { "epoch": 1.22, "learning_rate": 3.4555968861249403e-06, "logits/chosen": -1.855022668838501, "logits/rejected": -1.8307234048843384, "logps/chosen": -170.19735717773438, "logps/rejected": -91.34273529052734, "loss": 0.0905, "rewards/accuracies": 1.0, "rewards/chosen": 7.005059719085693, "rewards/margins": 1.6283836364746094, "rewards/rejected": 5.376676082611084, "step": 5530 }, { "epoch": 1.22, "learning_rate": 3.4538923006920417e-06, "logits/chosen": -2.0350072383880615, "logits/rejected": -1.567335844039917, "logps/chosen": -62.9383659362793, "logps/rejected": -42.88859939575195, "loss": 0.4035, "rewards/accuracies": 0.0, "rewards/chosen": 3.7000935077667236, "rewards/margins": -0.1272132396697998, "rewards/rejected": 3.8273067474365234, "step": 5531 }, { "epoch": 1.22, "learning_rate": 3.4521879139302648e-06, "logits/chosen": -1.652289628982544, "logits/rejected": -1.666566252708435, "logps/chosen": -38.371070861816406, "logps/rejected": -66.60761260986328, "loss": 0.6257, "rewards/accuracies": 1.0, "rewards/chosen": 4.758003234863281, "rewards/margins": 0.12108993530273438, "rewards/rejected": 4.636913299560547, "step": 5532 }, { "epoch": 1.22, "learning_rate": 3.450483726058622e-06, "logits/chosen": -1.9992064237594604, "logits/rejected": -1.9868041276931763, "logps/chosen": -40.20079803466797, "logps/rejected": -62.559478759765625, "loss": 0.4925, "rewards/accuracies": 1.0, "rewards/chosen": 4.245212554931641, "rewards/margins": 1.3600447177886963, "rewards/rejected": 2.8851678371429443, "step": 5533 }, { "epoch": 1.22, "learning_rate": 3.448779737296093e-06, "logits/chosen": -1.8985366821289062, "logits/rejected": -1.8664909601211548, "logps/chosen": -41.80359649658203, "logps/rejected": -41.44244384765625, "loss": 0.6002, "rewards/accuracies": 1.0, "rewards/chosen": 6.148255825042725, "rewards/margins": 1.7444686889648438, "rewards/rejected": 4.403787136077881, "step": 5534 }, { "epoch": 1.23, "learning_rate": 3.4470759478616405e-06, "logits/chosen": -1.873712420463562, "logits/rejected": -1.8755128383636475, "logps/chosen": -87.18421936035156, "logps/rejected": -101.86241912841797, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 5.47323751449585, "rewards/margins": 1.3846564292907715, "rewards/rejected": 4.088581085205078, "step": 5535 }, { "epoch": 1.23, "learning_rate": 3.445372357974194e-06, "logits/chosen": -1.8575645685195923, "logits/rejected": -1.7728419303894043, "logps/chosen": -24.003131866455078, "logps/rejected": -51.09988784790039, "loss": 0.3201, "rewards/accuracies": 1.0, "rewards/chosen": 3.0060794353485107, "rewards/margins": 0.540496826171875, "rewards/rejected": 2.4655826091766357, "step": 5536 }, { "epoch": 1.23, "learning_rate": 3.4436689678526635e-06, "logits/chosen": -1.9140950441360474, "logits/rejected": -1.7799237966537476, "logps/chosen": -52.912925720214844, "logps/rejected": -20.186763763427734, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 3.3008575439453125, "rewards/margins": 1.729527235031128, "rewards/rejected": 1.5713303089141846, "step": 5537 }, { "epoch": 1.23, "learning_rate": 3.4419657777159276e-06, "logits/chosen": -2.1192147731781006, "logits/rejected": -1.498024344444275, "logps/chosen": -173.8563232421875, "logps/rejected": -63.557003021240234, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 7.8909759521484375, "rewards/margins": 5.565601348876953, "rewards/rejected": 2.3253743648529053, "step": 5538 }, { "epoch": 1.23, "learning_rate": 3.4402627877828466e-06, "logits/chosen": -1.7443429231643677, "logits/rejected": -1.7271302938461304, "logps/chosen": -50.76823043823242, "logps/rejected": -98.80860137939453, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 4.245167255401611, "rewards/margins": 0.9713551998138428, "rewards/rejected": 3.2738120555877686, "step": 5539 }, { "epoch": 1.23, "learning_rate": 3.438559998272246e-06, "logits/chosen": -1.7464045286178589, "logits/rejected": -1.7239933013916016, "logps/chosen": -40.333709716796875, "logps/rejected": -31.010629653930664, "loss": 0.1647, "rewards/accuracies": 1.0, "rewards/chosen": 3.2548370361328125, "rewards/margins": 0.9450523853302002, "rewards/rejected": 2.3097846508026123, "step": 5540 }, { "epoch": 1.23, "learning_rate": 3.436857409402934e-06, "logits/chosen": -1.7483566999435425, "logits/rejected": -1.647496223449707, "logps/chosen": -92.35460662841797, "logps/rejected": -82.34751892089844, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 7.224289894104004, "rewards/margins": 4.427744388580322, "rewards/rejected": 2.7965455055236816, "step": 5541 }, { "epoch": 1.23, "learning_rate": 3.4351550213936865e-06, "logits/chosen": -2.0157036781311035, "logits/rejected": -2.0303194522857666, "logps/chosen": -82.592529296875, "logps/rejected": -67.27999877929688, "loss": 0.6052, "rewards/accuracies": 1.0, "rewards/chosen": 3.625135898590088, "rewards/margins": 2.0239624977111816, "rewards/rejected": 1.6011734008789062, "step": 5542 }, { "epoch": 1.23, "learning_rate": 3.4334528344632546e-06, "logits/chosen": -1.946797251701355, "logits/rejected": -1.8943043947219849, "logps/chosen": -25.901500701904297, "logps/rejected": -66.91735076904297, "loss": 0.9139, "rewards/accuracies": 0.0, "rewards/chosen": 3.406087875366211, "rewards/margins": -1.6488475799560547, "rewards/rejected": 5.054935455322266, "step": 5543 }, { "epoch": 1.23, "learning_rate": 3.431750848830371e-06, "logits/chosen": -2.0321054458618164, "logits/rejected": -1.899548888206482, "logps/chosen": -195.75439453125, "logps/rejected": -74.05191040039062, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 7.353946208953857, "rewards/margins": 3.378584623336792, "rewards/rejected": 3.9753615856170654, "step": 5544 }, { "epoch": 1.23, "learning_rate": 3.4300490647137297e-06, "logits/chosen": -1.9513179063796997, "logits/rejected": -1.8651033639907837, "logps/chosen": -72.1581802368164, "logps/rejected": -52.51166534423828, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 8.158863067626953, "rewards/margins": 3.284945011138916, "rewards/rejected": 4.873918056488037, "step": 5545 }, { "epoch": 1.23, "learning_rate": 3.4283474823320123e-06, "logits/chosen": -1.9618133306503296, "logits/rejected": -1.9439074993133545, "logps/chosen": -40.957977294921875, "logps/rejected": -40.87945556640625, "loss": 0.8412, "rewards/accuracies": 0.0, "rewards/chosen": 3.856722354888916, "rewards/margins": -1.4135618209838867, "rewards/rejected": 5.270284175872803, "step": 5546 }, { "epoch": 1.23, "learning_rate": 3.4266461019038644e-06, "logits/chosen": -2.088423728942871, "logits/rejected": -2.0427539348602295, "logps/chosen": -141.79173278808594, "logps/rejected": -115.50968933105469, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 9.58696174621582, "rewards/margins": 3.5316271781921387, "rewards/rejected": 6.055334568023682, "step": 5547 }, { "epoch": 1.23, "learning_rate": 3.42494492364791e-06, "logits/chosen": -1.6678087711334229, "logits/rejected": -1.6678087711334229, "logps/chosen": -21.642763137817383, "logps/rejected": -21.642763137817383, "loss": 0.4058, "rewards/accuracies": 0.0, "rewards/chosen": 1.9547895193099976, "rewards/margins": 0.0, "rewards/rejected": 1.9547895193099976, "step": 5548 }, { "epoch": 1.23, "learning_rate": 3.423243947782746e-06, "logits/chosen": -1.9377672672271729, "logits/rejected": -1.8180246353149414, "logps/chosen": -56.941558837890625, "logps/rejected": -23.41522979736328, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 2.871934652328491, "rewards/margins": 2.1267404556274414, "rewards/rejected": 0.7451942563056946, "step": 5549 }, { "epoch": 1.23, "learning_rate": 3.4215431745269463e-06, "logits/chosen": -2.0881760120391846, "logits/rejected": -2.1340625286102295, "logps/chosen": -84.10266876220703, "logps/rejected": -101.6868896484375, "loss": 1.3377, "rewards/accuracies": 0.0, "rewards/chosen": 4.559340953826904, "rewards/margins": -2.6011009216308594, "rewards/rejected": 7.160441875457764, "step": 5550 }, { "epoch": 1.23, "learning_rate": 3.419842604099054e-06, "logits/chosen": -1.9513049125671387, "logits/rejected": -1.9641040563583374, "logps/chosen": -48.38148498535156, "logps/rejected": -59.552947998046875, "loss": 0.3666, "rewards/accuracies": 1.0, "rewards/chosen": 3.815776824951172, "rewards/margins": 0.8395826816558838, "rewards/rejected": 2.976194143295288, "step": 5551 }, { "epoch": 1.23, "learning_rate": 3.418142236717586e-06, "logits/chosen": -2.0088493824005127, "logits/rejected": -1.9447133541107178, "logps/chosen": -117.93434143066406, "logps/rejected": -54.87401580810547, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 7.007096767425537, "rewards/margins": 4.8278045654296875, "rewards/rejected": 2.1792924404144287, "step": 5552 }, { "epoch": 1.23, "learning_rate": 3.41644207260104e-06, "logits/chosen": -1.7503288984298706, "logits/rejected": -1.7685816287994385, "logps/chosen": -52.83639907836914, "logps/rejected": -96.35768127441406, "loss": 0.9791, "rewards/accuracies": 0.0, "rewards/chosen": 4.959383964538574, "rewards/margins": -1.6216816902160645, "rewards/rejected": 6.581065654754639, "step": 5553 }, { "epoch": 1.23, "learning_rate": 3.4147421119678792e-06, "logits/chosen": -1.9042938947677612, "logits/rejected": -1.9050464630126953, "logps/chosen": -93.06714630126953, "logps/rejected": -81.52692413330078, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 9.819703102111816, "rewards/margins": 2.0535659790039062, "rewards/rejected": 7.76613712310791, "step": 5554 }, { "epoch": 1.23, "learning_rate": 3.4130423550365476e-06, "logits/chosen": -1.8871065378189087, "logits/rejected": -1.8976552486419678, "logps/chosen": -12.93636703491211, "logps/rejected": -35.73301315307617, "loss": 0.731, "rewards/accuracies": 0.0, "rewards/chosen": 1.5821809768676758, "rewards/margins": -1.0401122570037842, "rewards/rejected": 2.62229323387146, "step": 5555 }, { "epoch": 1.23, "learning_rate": 3.411342802025458e-06, "logits/chosen": -2.107922077178955, "logits/rejected": -2.12589168548584, "logps/chosen": -48.815643310546875, "logps/rejected": -37.43360900878906, "loss": 0.4191, "rewards/accuracies": 0.0, "rewards/chosen": 3.963275194168091, "rewards/margins": -0.02268362045288086, "rewards/rejected": 3.9859588146209717, "step": 5556 }, { "epoch": 1.23, "learning_rate": 3.4096434531529986e-06, "logits/chosen": -2.1358754634857178, "logits/rejected": -2.119257688522339, "logps/chosen": -52.507530212402344, "logps/rejected": -63.04145050048828, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 7.126271724700928, "rewards/margins": 3.49654221534729, "rewards/rejected": 3.6297295093536377, "step": 5557 }, { "epoch": 1.23, "learning_rate": 3.407944308637533e-06, "logits/chosen": -1.8816649913787842, "logits/rejected": -1.5655336380004883, "logps/chosen": -46.042259216308594, "logps/rejected": -43.584747314453125, "loss": 0.2082, "rewards/accuracies": 1.0, "rewards/chosen": 3.9323806762695312, "rewards/margins": 0.7585632801055908, "rewards/rejected": 3.1738173961639404, "step": 5558 }, { "epoch": 1.23, "learning_rate": 3.4062453686973977e-06, "logits/chosen": -1.8000156879425049, "logits/rejected": -1.0394220352172852, "logps/chosen": -48.24983215332031, "logps/rejected": -76.16588592529297, "loss": 1.0056, "rewards/accuracies": 0.0, "rewards/chosen": 2.724703311920166, "rewards/margins": -1.323408603668213, "rewards/rejected": 4.048111915588379, "step": 5559 }, { "epoch": 1.23, "learning_rate": 3.404546633550899e-06, "logits/chosen": -1.8108645677566528, "logits/rejected": -1.8108645677566528, "logps/chosen": -31.477577209472656, "logps/rejected": -31.477577209472656, "loss": 0.6184, "rewards/accuracies": 0.0, "rewards/chosen": 3.8165671825408936, "rewards/margins": 0.0, "rewards/rejected": 3.8165671825408936, "step": 5560 }, { "epoch": 1.23, "learning_rate": 3.4028481034163247e-06, "logits/chosen": -1.9464359283447266, "logits/rejected": -1.9682388305664062, "logps/chosen": -54.664024353027344, "logps/rejected": -110.62554931640625, "loss": 0.8186, "rewards/accuracies": 1.0, "rewards/chosen": 7.489893436431885, "rewards/margins": 0.14517593383789062, "rewards/rejected": 7.344717502593994, "step": 5561 }, { "epoch": 1.23, "learning_rate": 3.4011497785119296e-06, "logits/chosen": -2.0665221214294434, "logits/rejected": -2.0911307334899902, "logps/chosen": -37.0869140625, "logps/rejected": -75.61575317382812, "loss": 0.1927, "rewards/accuracies": 1.0, "rewards/chosen": 2.1993672847747803, "rewards/margins": 1.1990437507629395, "rewards/rejected": 1.0003235340118408, "step": 5562 }, { "epoch": 1.23, "learning_rate": 3.399451659055942e-06, "logits/chosen": -1.8554073572158813, "logits/rejected": -1.8554073572158813, "logps/chosen": -55.95710754394531, "logps/rejected": -55.95710754394531, "loss": 0.7086, "rewards/accuracies": 0.0, "rewards/chosen": 4.004265785217285, "rewards/margins": 0.0, "rewards/rejected": 4.004265785217285, "step": 5563 }, { "epoch": 1.23, "learning_rate": 3.397753745266571e-06, "logits/chosen": -1.7655173540115356, "logits/rejected": -1.7452744245529175, "logps/chosen": -96.9944076538086, "logps/rejected": -53.2346076965332, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 7.393717288970947, "rewards/margins": 2.8622798919677734, "rewards/rejected": 4.531437397003174, "step": 5564 }, { "epoch": 1.23, "learning_rate": 3.396056037361991e-06, "logits/chosen": -1.8227559328079224, "logits/rejected": -1.6908992528915405, "logps/chosen": -141.195068359375, "logps/rejected": -55.74385070800781, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 8.160967826843262, "rewards/margins": 4.628965377807617, "rewards/rejected": 3.5320022106170654, "step": 5565 }, { "epoch": 1.23, "learning_rate": 3.394358535560355e-06, "logits/chosen": -1.5467177629470825, "logits/rejected": -1.5181277990341187, "logps/chosen": -73.98429870605469, "logps/rejected": -49.130271911621094, "loss": 0.4666, "rewards/accuracies": 0.0, "rewards/chosen": 4.174643039703369, "rewards/margins": -0.26352787017822266, "rewards/rejected": 4.438170909881592, "step": 5566 }, { "epoch": 1.23, "learning_rate": 3.392661240079786e-06, "logits/chosen": -2.31210994720459, "logits/rejected": -2.3344197273254395, "logps/chosen": -77.01348876953125, "logps/rejected": -207.53518676757812, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 10.727627754211426, "rewards/margins": 4.058416843414307, "rewards/rejected": 6.669210910797119, "step": 5567 }, { "epoch": 1.23, "learning_rate": 3.390964151138385e-06, "logits/chosen": -1.7468663454055786, "logits/rejected": -1.7315216064453125, "logps/chosen": -85.5873031616211, "logps/rejected": -64.65528869628906, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 6.034759044647217, "rewards/margins": 3.1166200637817383, "rewards/rejected": 2.9181389808654785, "step": 5568 }, { "epoch": 1.23, "learning_rate": 3.389267268954221e-06, "logits/chosen": -1.9625239372253418, "logits/rejected": -1.9229191541671753, "logps/chosen": -39.16680908203125, "logps/rejected": -53.76533508300781, "loss": 0.2462, "rewards/accuracies": 1.0, "rewards/chosen": 3.6504623889923096, "rewards/margins": 0.454254150390625, "rewards/rejected": 3.1962082386016846, "step": 5569 }, { "epoch": 1.23, "learning_rate": 3.3875705937453428e-06, "logits/chosen": -2.2499682903289795, "logits/rejected": -2.2107958793640137, "logps/chosen": -122.15864562988281, "logps/rejected": -71.84484100341797, "loss": 0.0694, "rewards/accuracies": 1.0, "rewards/chosen": 6.291652202606201, "rewards/margins": 1.9481630325317383, "rewards/rejected": 4.343489170074463, "step": 5570 }, { "epoch": 1.23, "learning_rate": 3.3858741257297656e-06, "logits/chosen": -2.3665335178375244, "logits/rejected": -2.3782153129577637, "logps/chosen": -69.68678283691406, "logps/rejected": -208.17578125, "loss": 0.3811, "rewards/accuracies": 1.0, "rewards/chosen": 11.462440490722656, "rewards/margins": 2.638474464416504, "rewards/rejected": 8.823966026306152, "step": 5571 }, { "epoch": 1.23, "learning_rate": 3.3841778651254843e-06, "logits/chosen": -2.0721030235290527, "logits/rejected": -2.0668482780456543, "logps/chosen": -46.893043518066406, "logps/rejected": -46.17448806762695, "loss": 0.2487, "rewards/accuracies": 1.0, "rewards/chosen": 3.7729415893554688, "rewards/margins": 0.4766552448272705, "rewards/rejected": 3.2962863445281982, "step": 5572 }, { "epoch": 1.23, "learning_rate": 3.3824818121504634e-06, "logits/chosen": -2.1079750061035156, "logits/rejected": -2.066133975982666, "logps/chosen": -110.05860900878906, "logps/rejected": -121.40685272216797, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 8.758857727050781, "rewards/margins": 3.68772029876709, "rewards/rejected": 5.071137428283691, "step": 5573 }, { "epoch": 1.23, "learning_rate": 3.3807859670226394e-06, "logits/chosen": -2.043077230453491, "logits/rejected": -2.0474047660827637, "logps/chosen": -39.334693908691406, "logps/rejected": -53.390872955322266, "loss": 0.5655, "rewards/accuracies": 0.0, "rewards/chosen": 3.4037399291992188, "rewards/margins": -0.7044796943664551, "rewards/rejected": 4.108219623565674, "step": 5574 }, { "epoch": 1.23, "learning_rate": 3.379090329959928e-06, "logits/chosen": -2.0572524070739746, "logits/rejected": -2.0181820392608643, "logps/chosen": -66.58729553222656, "logps/rejected": -63.02313232421875, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 4.144256591796875, "rewards/margins": 2.0285720825195312, "rewards/rejected": 2.1156845092773438, "step": 5575 }, { "epoch": 1.23, "learning_rate": 3.377394901180211e-06, "logits/chosen": -2.124809741973877, "logits/rejected": -2.1500637531280518, "logps/chosen": -73.0190200805664, "logps/rejected": -56.41802978515625, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 6.630259037017822, "rewards/margins": 2.357131004333496, "rewards/rejected": 4.273128032684326, "step": 5576 }, { "epoch": 1.23, "learning_rate": 3.3756996809013496e-06, "logits/chosen": -1.4020812511444092, "logits/rejected": -1.3038063049316406, "logps/chosen": -34.43900680541992, "logps/rejected": -9.565765380859375, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 3.3477025032043457, "rewards/margins": 2.0413970947265625, "rewards/rejected": 1.3063052892684937, "step": 5577 }, { "epoch": 1.23, "learning_rate": 3.374004669341173e-06, "logits/chosen": -1.9529762268066406, "logits/rejected": -1.9762611389160156, "logps/chosen": -39.61119842529297, "logps/rejected": -80.02851867675781, "loss": 0.3782, "rewards/accuracies": 1.0, "rewards/chosen": 3.9512383937835693, "rewards/margins": 0.23492145538330078, "rewards/rejected": 3.7163169384002686, "step": 5578 }, { "epoch": 1.23, "learning_rate": 3.3723098667174893e-06, "logits/chosen": -1.966072916984558, "logits/rejected": -1.9453601837158203, "logps/chosen": -34.36290740966797, "logps/rejected": -34.54493713378906, "loss": 0.9561, "rewards/accuracies": 0.0, "rewards/chosen": 2.82200288772583, "rewards/margins": -1.7409253120422363, "rewards/rejected": 4.562928199768066, "step": 5579 }, { "epoch": 1.24, "learning_rate": 3.3706152732480733e-06, "logits/chosen": -2.0874223709106445, "logits/rejected": -2.0491111278533936, "logps/chosen": -66.22577667236328, "logps/rejected": -36.526451110839844, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 6.041359901428223, "rewards/margins": 2.4654381275177, "rewards/rejected": 3.5759217739105225, "step": 5580 }, { "epoch": 1.24, "learning_rate": 3.36892088915068e-06, "logits/chosen": -2.023404598236084, "logits/rejected": -2.0372328758239746, "logps/chosen": -64.03666687011719, "logps/rejected": -128.3251953125, "loss": 0.4181, "rewards/accuracies": 1.0, "rewards/chosen": 8.619292259216309, "rewards/margins": 0.7481756210327148, "rewards/rejected": 7.871116638183594, "step": 5581 }, { "epoch": 1.24, "learning_rate": 3.3672267146430304e-06, "logits/chosen": -1.9576748609542847, "logits/rejected": -1.9944286346435547, "logps/chosen": -92.08125305175781, "logps/rejected": -142.92446899414062, "loss": 0.3267, "rewards/accuracies": 1.0, "rewards/chosen": 10.073829650878906, "rewards/margins": 1.9912729263305664, "rewards/rejected": 8.08255672454834, "step": 5582 }, { "epoch": 1.24, "learning_rate": 3.3655327499428225e-06, "logits/chosen": -2.4033915996551514, "logits/rejected": -2.325523614883423, "logps/chosen": -103.45951843261719, "logps/rejected": -147.64743041992188, "loss": 0.1568, "rewards/accuracies": 1.0, "rewards/chosen": 10.352736473083496, "rewards/margins": 2.1165637969970703, "rewards/rejected": 8.236172676086426, "step": 5583 }, { "epoch": 1.24, "learning_rate": 3.363838995267728e-06, "logits/chosen": -2.139604091644287, "logits/rejected": -2.1275076866149902, "logps/chosen": -89.4612045288086, "logps/rejected": -78.4925765991211, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": 9.224292755126953, "rewards/margins": 2.8985595703125, "rewards/rejected": 6.325733184814453, "step": 5584 }, { "epoch": 1.24, "learning_rate": 3.3621454508353874e-06, "logits/chosen": -1.9441592693328857, "logits/rejected": -1.8277257680892944, "logps/chosen": -99.82457733154297, "logps/rejected": -45.7458610534668, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 5.8785834312438965, "rewards/margins": 3.5005099773406982, "rewards/rejected": 2.3780734539031982, "step": 5585 }, { "epoch": 1.24, "learning_rate": 3.360452116863421e-06, "logits/chosen": -1.9242230653762817, "logits/rejected": -1.9088002443313599, "logps/chosen": -49.31889343261719, "logps/rejected": -55.805328369140625, "loss": 0.4357, "rewards/accuracies": 0.0, "rewards/chosen": 4.6044602394104, "rewards/margins": -0.3126206398010254, "rewards/rejected": 4.917080879211426, "step": 5586 }, { "epoch": 1.24, "learning_rate": 3.3587589935694142e-06, "logits/chosen": -1.7407443523406982, "logits/rejected": -1.7407443523406982, "logps/chosen": -38.756317138671875, "logps/rejected": -38.756317138671875, "loss": 0.3687, "rewards/accuracies": 0.0, "rewards/chosen": 3.988482713699341, "rewards/margins": 0.0, "rewards/rejected": 3.988482713699341, "step": 5587 }, { "epoch": 1.24, "learning_rate": 3.357066081170934e-06, "logits/chosen": -1.985702395439148, "logits/rejected": -2.0069169998168945, "logps/chosen": -41.31001281738281, "logps/rejected": -40.558204650878906, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": 3.280848741531372, "rewards/margins": 0.8376426696777344, "rewards/rejected": 2.4432060718536377, "step": 5588 }, { "epoch": 1.24, "learning_rate": 3.355373379885511e-06, "logits/chosen": -1.8611869812011719, "logits/rejected": -1.8102343082427979, "logps/chosen": -148.20509338378906, "logps/rejected": -62.258216857910156, "loss": 0.139, "rewards/accuracies": 1.0, "rewards/chosen": 7.33392333984375, "rewards/margins": 1.817594051361084, "rewards/rejected": 5.516329288482666, "step": 5589 }, { "epoch": 1.24, "learning_rate": 3.3536808899306548e-06, "logits/chosen": -2.3123092651367188, "logits/rejected": -2.3031439781188965, "logps/chosen": -90.88206481933594, "logps/rejected": -36.72471618652344, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 5.796112060546875, "rewards/margins": 5.288628578186035, "rewards/rejected": 0.5074836611747742, "step": 5590 }, { "epoch": 1.24, "learning_rate": 3.3519886115238477e-06, "logits/chosen": -2.0219459533691406, "logits/rejected": -2.0268771648406982, "logps/chosen": -38.38214111328125, "logps/rejected": -65.82542419433594, "loss": 0.3178, "rewards/accuracies": 1.0, "rewards/chosen": 4.545202732086182, "rewards/margins": 0.4087514877319336, "rewards/rejected": 4.136451244354248, "step": 5591 }, { "epoch": 1.24, "learning_rate": 3.350296544882543e-06, "logits/chosen": -1.6628222465515137, "logits/rejected": -1.5441974401474, "logps/chosen": -47.95001220703125, "logps/rejected": -28.458988189697266, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 4.747808933258057, "rewards/margins": 3.63371205329895, "rewards/rejected": 1.1140968799591064, "step": 5592 }, { "epoch": 1.24, "learning_rate": 3.3486046902241663e-06, "logits/chosen": -2.134979724884033, "logits/rejected": -2.1199307441711426, "logps/chosen": -106.7912826538086, "logps/rejected": -132.74435424804688, "loss": 0.6234, "rewards/accuracies": 0.0, "rewards/chosen": 8.536039352416992, "rewards/margins": -0.9078130722045898, "rewards/rejected": 9.443852424621582, "step": 5593 }, { "epoch": 1.24, "learning_rate": 3.3469130477661147e-06, "logits/chosen": -2.101870059967041, "logits/rejected": -2.0665571689605713, "logps/chosen": -93.98432159423828, "logps/rejected": -103.49699401855469, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": 7.452938079833984, "rewards/margins": 3.0429940223693848, "rewards/rejected": 4.4099440574646, "step": 5594 }, { "epoch": 1.24, "learning_rate": 3.345221617725765e-06, "logits/chosen": -2.0532774925231934, "logits/rejected": -2.0203840732574463, "logps/chosen": -84.91384887695312, "logps/rejected": -39.173763275146484, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 4.761003017425537, "rewards/margins": 2.6604020595550537, "rewards/rejected": 2.1006009578704834, "step": 5595 }, { "epoch": 1.24, "learning_rate": 3.343530400320457e-06, "logits/chosen": -1.5628634691238403, "logits/rejected": -1.579092025756836, "logps/chosen": -32.07798767089844, "logps/rejected": -56.217552185058594, "loss": 0.2883, "rewards/accuracies": 1.0, "rewards/chosen": 4.306988716125488, "rewards/margins": 0.25507497787475586, "rewards/rejected": 4.051913738250732, "step": 5596 }, { "epoch": 1.24, "learning_rate": 3.3418393957675133e-06, "logits/chosen": -2.078272819519043, "logits/rejected": -2.0619072914123535, "logps/chosen": -58.793601989746094, "logps/rejected": -89.36873626708984, "loss": 0.1727, "rewards/accuracies": 1.0, "rewards/chosen": 9.166155815124512, "rewards/margins": 1.0337848663330078, "rewards/rejected": 8.132370948791504, "step": 5597 }, { "epoch": 1.24, "learning_rate": 3.3401486042842203e-06, "logits/chosen": -2.125480890274048, "logits/rejected": -2.0651049613952637, "logps/chosen": -62.885215759277344, "logps/rejected": -43.59394073486328, "loss": 0.1851, "rewards/accuracies": 1.0, "rewards/chosen": 4.399118900299072, "rewards/margins": 1.1182045936584473, "rewards/rejected": 3.280914306640625, "step": 5598 }, { "epoch": 1.24, "learning_rate": 3.338458026087842e-06, "logits/chosen": -2.080087423324585, "logits/rejected": -2.0080270767211914, "logps/chosen": -62.51218032836914, "logps/rejected": -17.89826202392578, "loss": 0.1541, "rewards/accuracies": 1.0, "rewards/chosen": 1.3522891998291016, "rewards/margins": 1.0361477136611938, "rewards/rejected": 0.3161415159702301, "step": 5599 }, { "epoch": 1.24, "learning_rate": 3.33676766139561e-06, "logits/chosen": -1.8716436624526978, "logits/rejected": -1.803868055343628, "logps/chosen": -86.1901626586914, "logps/rejected": -57.76154708862305, "loss": 0.7041, "rewards/accuracies": 1.0, "rewards/chosen": 4.9221720695495605, "rewards/margins": 0.8389682769775391, "rewards/rejected": 4.0832037925720215, "step": 5600 }, { "epoch": 1.24, "learning_rate": 3.335077510424739e-06, "logits/chosen": -2.2098467350006104, "logits/rejected": -2.2334158420562744, "logps/chosen": -62.40269088745117, "logps/rejected": -62.49064636230469, "loss": 0.1458, "rewards/accuracies": 1.0, "rewards/chosen": 5.60873556137085, "rewards/margins": 2.09804368019104, "rewards/rejected": 3.5106918811798096, "step": 5601 }, { "epoch": 1.24, "learning_rate": 3.333387573392405e-06, "logits/chosen": -1.8681753873825073, "logits/rejected": -1.8543401956558228, "logps/chosen": -57.16849899291992, "logps/rejected": -55.53925323486328, "loss": 0.1857, "rewards/accuracies": 1.0, "rewards/chosen": 3.5503575801849365, "rewards/margins": 1.2823331356048584, "rewards/rejected": 2.268024444580078, "step": 5602 }, { "epoch": 1.24, "learning_rate": 3.331697850515758e-06, "logits/chosen": -1.8599467277526855, "logits/rejected": -1.767485499382019, "logps/chosen": -88.25953674316406, "logps/rejected": -79.4005126953125, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 6.034156799316406, "rewards/margins": 3.7604010105133057, "rewards/rejected": 2.2737557888031006, "step": 5603 }, { "epoch": 1.24, "learning_rate": 3.3300083420119307e-06, "logits/chosen": -1.8527312278747559, "logits/rejected": -1.8084896802902222, "logps/chosen": -50.15796661376953, "logps/rejected": -41.47690963745117, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": 4.840232849121094, "rewards/margins": 1.7918179035186768, "rewards/rejected": 3.048414945602417, "step": 5604 }, { "epoch": 1.24, "learning_rate": 3.3283190480980136e-06, "logits/chosen": -1.9920850992202759, "logits/rejected": -1.9395643472671509, "logps/chosen": -83.48439025878906, "logps/rejected": -70.6489028930664, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 6.014839172363281, "rewards/margins": 1.7127571105957031, "rewards/rejected": 4.302082061767578, "step": 5605 }, { "epoch": 1.24, "learning_rate": 3.326629968991083e-06, "logits/chosen": -1.8739402294158936, "logits/rejected": -1.8658511638641357, "logps/chosen": -37.05678939819336, "logps/rejected": -59.09638214111328, "loss": 0.5228, "rewards/accuracies": 1.0, "rewards/chosen": 2.4275176525115967, "rewards/margins": 0.10143494606018066, "rewards/rejected": 2.326082706451416, "step": 5606 }, { "epoch": 1.24, "learning_rate": 3.324941104908177e-06, "logits/chosen": -2.1520469188690186, "logits/rejected": -2.1010818481445312, "logps/chosen": -52.47944259643555, "logps/rejected": -77.9716567993164, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 5.6233038902282715, "rewards/margins": 1.9407072067260742, "rewards/rejected": 3.6825966835021973, "step": 5607 }, { "epoch": 1.24, "learning_rate": 3.3232524560663137e-06, "logits/chosen": -2.0029654502868652, "logits/rejected": -1.9101707935333252, "logps/chosen": -97.91192626953125, "logps/rejected": -51.44794464111328, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 7.827282905578613, "rewards/margins": 4.513811111450195, "rewards/rejected": 3.313472032546997, "step": 5608 }, { "epoch": 1.24, "learning_rate": 3.3215640226824776e-06, "logits/chosen": -2.064988374710083, "logits/rejected": -2.124898672103882, "logps/chosen": -27.99596405029297, "logps/rejected": -69.04047393798828, "loss": 1.6563, "rewards/accuracies": 0.0, "rewards/chosen": 3.390619993209839, "rewards/margins": -3.2247512340545654, "rewards/rejected": 6.615371227264404, "step": 5609 }, { "epoch": 1.24, "learning_rate": 3.3198758049736314e-06, "logits/chosen": -1.9852548837661743, "logits/rejected": -1.987318992614746, "logps/chosen": -36.970848083496094, "logps/rejected": -64.56857299804688, "loss": 0.2463, "rewards/accuracies": 1.0, "rewards/chosen": 3.627340078353882, "rewards/margins": 0.9987053871154785, "rewards/rejected": 2.6286346912384033, "step": 5610 }, { "epoch": 1.24, "learning_rate": 3.3181878031567025e-06, "logits/chosen": -1.9313633441925049, "logits/rejected": -1.9369962215423584, "logps/chosen": -44.81766128540039, "logps/rejected": -48.50891876220703, "loss": 0.3995, "rewards/accuracies": 0.0, "rewards/chosen": 3.522125005722046, "rewards/margins": -0.15927839279174805, "rewards/rejected": 3.681403398513794, "step": 5611 }, { "epoch": 1.24, "learning_rate": 3.316500017448601e-06, "logits/chosen": -1.565328598022461, "logits/rejected": -1.5244359970092773, "logps/chosen": -48.461219787597656, "logps/rejected": -81.53492736816406, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 3.523303985595703, "rewards/margins": 1.7610816955566406, "rewards/rejected": 1.7622222900390625, "step": 5612 }, { "epoch": 1.24, "learning_rate": 3.3148124480661993e-06, "logits/chosen": -1.7744166851043701, "logits/rejected": -1.796744465827942, "logps/chosen": -20.763839721679688, "logps/rejected": -39.05431365966797, "loss": 0.8834, "rewards/accuracies": 0.0, "rewards/chosen": 2.2511067390441895, "rewards/margins": -0.8943653106689453, "rewards/rejected": 3.1454720497131348, "step": 5613 }, { "epoch": 1.24, "learning_rate": 3.3131250952263456e-06, "logits/chosen": -1.8189525604248047, "logits/rejected": -1.7609679698944092, "logps/chosen": -114.83631896972656, "logps/rejected": -113.18170166015625, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 8.76426887512207, "rewards/margins": 4.670799732208252, "rewards/rejected": 4.093469142913818, "step": 5614 }, { "epoch": 1.24, "learning_rate": 3.3114379591458632e-06, "logits/chosen": -1.8497384786605835, "logits/rejected": -1.8309246301651, "logps/chosen": -21.608383178710938, "logps/rejected": -37.4062385559082, "loss": 0.1576, "rewards/accuracies": 1.0, "rewards/chosen": 2.665013551712036, "rewards/margins": 1.181111216545105, "rewards/rejected": 1.4839023351669312, "step": 5615 }, { "epoch": 1.24, "learning_rate": 3.309751040041544e-06, "logits/chosen": -1.821584939956665, "logits/rejected": -1.7415895462036133, "logps/chosen": -52.72108459472656, "logps/rejected": -10.122758865356445, "loss": 1.9012, "rewards/accuracies": 1.0, "rewards/chosen": 3.3219101428985596, "rewards/margins": 1.8527475595474243, "rewards/rejected": 1.4691625833511353, "step": 5616 }, { "epoch": 1.24, "learning_rate": 3.3080643381301536e-06, "logits/chosen": -1.8725560903549194, "logits/rejected": -1.8318045139312744, "logps/chosen": -53.27294158935547, "logps/rejected": -45.097877502441406, "loss": 0.3404, "rewards/accuracies": 1.0, "rewards/chosen": 3.487752676010132, "rewards/margins": 0.22616362571716309, "rewards/rejected": 3.2615890502929688, "step": 5617 }, { "epoch": 1.24, "learning_rate": 3.3063778536284263e-06, "logits/chosen": -1.8472880125045776, "logits/rejected": -1.8818942308425903, "logps/chosen": -310.90667724609375, "logps/rejected": -90.1229248046875, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 11.795809745788574, "rewards/margins": 5.410587787628174, "rewards/rejected": 6.3852219581604, "step": 5618 }, { "epoch": 1.24, "learning_rate": 3.304691586753075e-06, "logits/chosen": -1.9186967611312866, "logits/rejected": -1.752494215965271, "logps/chosen": -76.25608825683594, "logps/rejected": -5.8273539543151855, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 3.328080892562866, "rewards/margins": 2.347487449645996, "rewards/rejected": 0.9805933237075806, "step": 5619 }, { "epoch": 1.24, "learning_rate": 3.303005537720778e-06, "logits/chosen": -2.175251007080078, "logits/rejected": -2.0996694564819336, "logps/chosen": -134.31375122070312, "logps/rejected": -65.49336242675781, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 6.857434272766113, "rewards/margins": 3.3851990699768066, "rewards/rejected": 3.4722352027893066, "step": 5620 }, { "epoch": 1.24, "learning_rate": 3.3013197067481916e-06, "logits/chosen": -1.9666913747787476, "logits/rejected": -1.9359378814697266, "logps/chosen": -115.42506408691406, "logps/rejected": -45.586212158203125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 6.373847961425781, "rewards/margins": 5.348899841308594, "rewards/rejected": 1.0249481201171875, "step": 5621 }, { "epoch": 1.24, "learning_rate": 3.2996340940519387e-06, "logits/chosen": -1.985135555267334, "logits/rejected": -1.863844871520996, "logps/chosen": -75.32657623291016, "logps/rejected": -36.10274124145508, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": 5.468624114990234, "rewards/margins": 1.5141901969909668, "rewards/rejected": 3.9544339179992676, "step": 5622 }, { "epoch": 1.24, "learning_rate": 3.2979486998486164e-06, "logits/chosen": -1.7917345762252808, "logits/rejected": -1.786757230758667, "logps/chosen": -63.263275146484375, "logps/rejected": -109.64673614501953, "loss": 0.5124, "rewards/accuracies": 0.0, "rewards/chosen": 4.720263957977295, "rewards/margins": -0.4474143981933594, "rewards/rejected": 5.167678356170654, "step": 5623 }, { "epoch": 1.24, "learning_rate": 3.296263524354797e-06, "logits/chosen": -2.0597755908966064, "logits/rejected": -2.0800864696502686, "logps/chosen": -27.124528884887695, "logps/rejected": -88.42723083496094, "loss": 0.4348, "rewards/accuracies": 0.0, "rewards/chosen": 2.4729833602905273, "rewards/margins": -0.32477831840515137, "rewards/rejected": 2.7977616786956787, "step": 5624 }, { "epoch": 1.25, "learning_rate": 3.294578567787017e-06, "logits/chosen": -1.9208532571792603, "logits/rejected": -1.9749021530151367, "logps/chosen": -26.593589782714844, "logps/rejected": -92.507080078125, "loss": 0.9572, "rewards/accuracies": 0.0, "rewards/chosen": 4.806769847869873, "rewards/margins": -1.7487034797668457, "rewards/rejected": 6.555473327636719, "step": 5625 }, { "epoch": 1.25, "learning_rate": 3.292893830361792e-06, "logits/chosen": -1.7177140712738037, "logits/rejected": -1.6662575006484985, "logps/chosen": -32.796730041503906, "logps/rejected": -48.56785583496094, "loss": 0.2469, "rewards/accuracies": 1.0, "rewards/chosen": 3.015854597091675, "rewards/margins": 1.167838215827942, "rewards/rejected": 1.848016381263733, "step": 5626 }, { "epoch": 1.25, "learning_rate": 3.2912093122956046e-06, "logits/chosen": -1.1807537078857422, "logits/rejected": -1.130096435546875, "logps/chosen": -35.35061264038086, "logps/rejected": -49.73712921142578, "loss": 0.3284, "rewards/accuracies": 1.0, "rewards/chosen": 2.293588399887085, "rewards/margins": 2.1000521183013916, "rewards/rejected": 0.19353638589382172, "step": 5627 }, { "epoch": 1.25, "learning_rate": 3.289525013804915e-06, "logits/chosen": -1.883803367614746, "logits/rejected": -1.9009125232696533, "logps/chosen": -25.79132843017578, "logps/rejected": -59.587364196777344, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": 4.387096405029297, "rewards/margins": 0.04685211181640625, "rewards/rejected": 4.340244293212891, "step": 5628 }, { "epoch": 1.25, "learning_rate": 3.287840935106147e-06, "logits/chosen": -1.8320344686508179, "logits/rejected": -1.7359262704849243, "logps/chosen": -51.81974792480469, "logps/rejected": -67.02796936035156, "loss": 0.4474, "rewards/accuracies": 1.0, "rewards/chosen": 6.329535961151123, "rewards/margins": 1.2985634803771973, "rewards/rejected": 5.030972480773926, "step": 5629 }, { "epoch": 1.25, "learning_rate": 3.286157076415705e-06, "logits/chosen": -2.008578062057495, "logits/rejected": -1.890516996383667, "logps/chosen": -112.15045928955078, "logps/rejected": -54.43316650390625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 10.094300270080566, "rewards/margins": 7.323670387268066, "rewards/rejected": 2.7706298828125, "step": 5630 }, { "epoch": 1.25, "learning_rate": 3.284473437949957e-06, "logits/chosen": -2.125624418258667, "logits/rejected": -2.081796884536743, "logps/chosen": -50.38133239746094, "logps/rejected": -40.89491653442383, "loss": 0.1284, "rewards/accuracies": 1.0, "rewards/chosen": 4.548093318939209, "rewards/margins": 2.9700026512145996, "rewards/rejected": 1.5780906677246094, "step": 5631 }, { "epoch": 1.25, "learning_rate": 3.282790019925249e-06, "logits/chosen": -1.9486422538757324, "logits/rejected": -1.9041532278060913, "logps/chosen": -10.158831596374512, "logps/rejected": -28.94114875793457, "loss": 1.3443, "rewards/accuracies": 0.0, "rewards/chosen": 2.408975601196289, "rewards/margins": -1.8223209381103516, "rewards/rejected": 4.231296539306641, "step": 5632 }, { "epoch": 1.25, "learning_rate": 3.2811068225578955e-06, "logits/chosen": -1.7862476110458374, "logits/rejected": -1.754679799079895, "logps/chosen": -80.96131896972656, "logps/rejected": -77.4899673461914, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 6.739715576171875, "rewards/margins": 1.9871139526367188, "rewards/rejected": 4.752601623535156, "step": 5633 }, { "epoch": 1.25, "learning_rate": 3.2794238460641837e-06, "logits/chosen": -1.828972339630127, "logits/rejected": -1.7733888626098633, "logps/chosen": -75.42243957519531, "logps/rejected": -80.96450805664062, "loss": 0.5988, "rewards/accuracies": 1.0, "rewards/chosen": 2.9200074672698975, "rewards/margins": 1.37314772605896, "rewards/rejected": 1.5468597412109375, "step": 5634 }, { "epoch": 1.25, "learning_rate": 3.277741090660371e-06, "logits/chosen": -1.9212530851364136, "logits/rejected": -1.9512795209884644, "logps/chosen": -36.370765686035156, "logps/rejected": -106.84468841552734, "loss": 1.922, "rewards/accuracies": 0.0, "rewards/chosen": 4.394693851470947, "rewards/margins": -3.8138461112976074, "rewards/rejected": 8.208539962768555, "step": 5635 }, { "epoch": 1.25, "learning_rate": 3.276058556562687e-06, "logits/chosen": -1.9667515754699707, "logits/rejected": -1.937215805053711, "logps/chosen": -100.44013977050781, "logps/rejected": -58.704444885253906, "loss": 0.1557, "rewards/accuracies": 1.0, "rewards/chosen": 4.404982089996338, "rewards/margins": 1.2366111278533936, "rewards/rejected": 3.1683709621429443, "step": 5636 }, { "epoch": 1.25, "learning_rate": 3.274376243987336e-06, "logits/chosen": -1.926486849784851, "logits/rejected": -1.8610302209854126, "logps/chosen": -98.73457336425781, "logps/rejected": -53.19677734375, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 7.623918056488037, "rewards/margins": 3.3965377807617188, "rewards/rejected": 4.227380275726318, "step": 5637 }, { "epoch": 1.25, "learning_rate": 3.2726941531504867e-06, "logits/chosen": -1.7759416103363037, "logits/rejected": -1.5855399370193481, "logps/chosen": -47.798606872558594, "logps/rejected": -35.329811096191406, "loss": 0.1292, "rewards/accuracies": 1.0, "rewards/chosen": 3.3866219520568848, "rewards/margins": 1.4235912561416626, "rewards/rejected": 1.9630306959152222, "step": 5638 }, { "epoch": 1.25, "learning_rate": 3.271012284268289e-06, "logits/chosen": -1.9976215362548828, "logits/rejected": -2.0244052410125732, "logps/chosen": -68.48963928222656, "logps/rejected": -75.46570587158203, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 7.299800395965576, "rewards/margins": 1.8784847259521484, "rewards/rejected": 5.421315670013428, "step": 5639 }, { "epoch": 1.25, "learning_rate": 3.269330637556856e-06, "logits/chosen": -2.04196834564209, "logits/rejected": -2.0156941413879395, "logps/chosen": -61.338157653808594, "logps/rejected": -59.680660247802734, "loss": 0.2852, "rewards/accuracies": 1.0, "rewards/chosen": 4.939972877502441, "rewards/margins": 0.2905917167663574, "rewards/rejected": 4.649381160736084, "step": 5640 }, { "epoch": 1.25, "learning_rate": 3.267649213232276e-06, "logits/chosen": -1.8516724109649658, "logits/rejected": -1.8180898427963257, "logps/chosen": -115.24673461914062, "logps/rejected": -136.271728515625, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 9.599260330200195, "rewards/margins": 3.0495991706848145, "rewards/rejected": 6.549661159515381, "step": 5641 }, { "epoch": 1.25, "learning_rate": 3.2659680115106053e-06, "logits/chosen": -1.8953028917312622, "logits/rejected": -1.8465614318847656, "logps/chosen": -106.3902587890625, "logps/rejected": -16.20901107788086, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 4.936145305633545, "rewards/margins": 4.122365951538086, "rewards/rejected": 0.8137792944908142, "step": 5642 }, { "epoch": 1.25, "learning_rate": 3.264287032607879e-06, "logits/chosen": -2.081099033355713, "logits/rejected": -2.004351854324341, "logps/chosen": -87.05841827392578, "logps/rejected": -57.626922607421875, "loss": 0.2063, "rewards/accuracies": 1.0, "rewards/chosen": 8.002894401550293, "rewards/margins": 1.4309782981872559, "rewards/rejected": 6.571916103363037, "step": 5643 }, { "epoch": 1.25, "learning_rate": 3.262606276740096e-06, "logits/chosen": -1.8645600080490112, "logits/rejected": -1.9022936820983887, "logps/chosen": -35.74224090576172, "logps/rejected": -94.66128540039062, "loss": 0.1627, "rewards/accuracies": 1.0, "rewards/chosen": 4.462107181549072, "rewards/margins": 1.3141114711761475, "rewards/rejected": 3.147995710372925, "step": 5644 }, { "epoch": 1.25, "learning_rate": 3.2609257441232282e-06, "logits/chosen": -1.6423687934875488, "logits/rejected": -1.665899634361267, "logps/chosen": -44.80894088745117, "logps/rejected": -74.09004211425781, "loss": 0.6722, "rewards/accuracies": 0.0, "rewards/chosen": 3.3456287384033203, "rewards/margins": -0.3396031856536865, "rewards/rejected": 3.685231924057007, "step": 5645 }, { "epoch": 1.25, "learning_rate": 3.259245434973224e-06, "logits/chosen": -2.3353381156921387, "logits/rejected": -2.355287551879883, "logps/chosen": -53.08282470703125, "logps/rejected": -67.76655578613281, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 3.784066915512085, "rewards/margins": 2.4616289138793945, "rewards/rejected": 1.32243812084198, "step": 5646 }, { "epoch": 1.25, "learning_rate": 3.2575653495059945e-06, "logits/chosen": -1.9285850524902344, "logits/rejected": -1.9332101345062256, "logps/chosen": -42.374916076660156, "logps/rejected": -76.94680786132812, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 4.522176265716553, "rewards/margins": 1.6105763912200928, "rewards/rejected": 2.91159987449646, "step": 5647 }, { "epoch": 1.25, "learning_rate": 3.255885487937431e-06, "logits/chosen": -1.86871337890625, "logits/rejected": -1.856915831565857, "logps/chosen": -43.10618209838867, "logps/rejected": -60.340492248535156, "loss": 0.4687, "rewards/accuracies": 1.0, "rewards/chosen": 4.278348445892334, "rewards/margins": 0.11060333251953125, "rewards/rejected": 4.167745113372803, "step": 5648 }, { "epoch": 1.25, "learning_rate": 3.2542058504833885e-06, "logits/chosen": -2.312136173248291, "logits/rejected": -2.3118627071380615, "logps/chosen": -57.24714660644531, "logps/rejected": -77.55831909179688, "loss": 0.1419, "rewards/accuracies": 1.0, "rewards/chosen": 4.223186016082764, "rewards/margins": 1.1416170597076416, "rewards/rejected": 3.081568956375122, "step": 5649 }, { "epoch": 1.25, "learning_rate": 3.2525264373596983e-06, "logits/chosen": -2.0621602535247803, "logits/rejected": -2.051393747329712, "logps/chosen": -20.433032989501953, "logps/rejected": -32.589210510253906, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 2.0341198444366455, "rewards/margins": -0.20723772048950195, "rewards/rejected": 2.2413575649261475, "step": 5650 }, { "epoch": 1.25, "learning_rate": 3.250847248782158e-06, "logits/chosen": -2.180593967437744, "logits/rejected": -2.16373872756958, "logps/chosen": -42.756248474121094, "logps/rejected": -49.239585876464844, "loss": 1.4854, "rewards/accuracies": 0.0, "rewards/chosen": 3.6353187561035156, "rewards/margins": -2.330392360687256, "rewards/rejected": 5.9657111167907715, "step": 5651 }, { "epoch": 1.25, "learning_rate": 3.2491682849665433e-06, "logits/chosen": -1.9694539308547974, "logits/rejected": -1.962618112564087, "logps/chosen": -118.36934661865234, "logps/rejected": -135.20440673828125, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 8.825956344604492, "rewards/margins": 1.6900553703308105, "rewards/rejected": 7.135900974273682, "step": 5652 }, { "epoch": 1.25, "learning_rate": 3.247489546128596e-06, "logits/chosen": -1.5088533163070679, "logits/rejected": -1.4924638271331787, "logps/chosen": -48.99586868286133, "logps/rejected": -32.265968322753906, "loss": 0.1716, "rewards/accuracies": 1.0, "rewards/chosen": 2.485239028930664, "rewards/margins": 1.0170832872390747, "rewards/rejected": 1.4681557416915894, "step": 5653 }, { "epoch": 1.25, "learning_rate": 3.2458110324840265e-06, "logits/chosen": -1.6252893209457397, "logits/rejected": -1.5369051694869995, "logps/chosen": -56.0103645324707, "logps/rejected": -13.360011100769043, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": 3.5618221759796143, "rewards/margins": 3.458204984664917, "rewards/rejected": 0.1036170944571495, "step": 5654 }, { "epoch": 1.25, "learning_rate": 3.244132744248524e-06, "logits/chosen": -2.002032518386841, "logits/rejected": -1.8995954990386963, "logps/chosen": -83.30426025390625, "logps/rejected": -22.740985870361328, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 7.889392375946045, "rewards/margins": 4.653206825256348, "rewards/rejected": 3.2361857891082764, "step": 5655 }, { "epoch": 1.25, "learning_rate": 3.242454681637741e-06, "logits/chosen": -1.9966098070144653, "logits/rejected": -2.0225722789764404, "logps/chosen": -36.576622009277344, "logps/rejected": -45.73936462402344, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 4.1144890785217285, "rewards/margins": 1.0044867992401123, "rewards/rejected": 3.110002279281616, "step": 5656 }, { "epoch": 1.25, "learning_rate": 3.2407768448673086e-06, "logits/chosen": -1.7947016954421997, "logits/rejected": -1.739123821258545, "logps/chosen": -41.43904113769531, "logps/rejected": -62.938804626464844, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": 3.8302979469299316, "rewards/margins": 1.6039917469024658, "rewards/rejected": 2.226306200027466, "step": 5657 }, { "epoch": 1.25, "learning_rate": 3.2390992341528217e-06, "logits/chosen": -1.9733104705810547, "logits/rejected": -1.9626966714859009, "logps/chosen": -36.3406982421875, "logps/rejected": -65.77733612060547, "loss": 0.5222, "rewards/accuracies": 1.0, "rewards/chosen": 5.691716194152832, "rewards/margins": 1.4424858093261719, "rewards/rejected": 4.24923038482666, "step": 5658 }, { "epoch": 1.25, "learning_rate": 3.237421849709851e-06, "logits/chosen": -1.7224384546279907, "logits/rejected": -1.6809817552566528, "logps/chosen": -23.3280086517334, "logps/rejected": -6.785216808319092, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 2.4644076824188232, "rewards/margins": 0.7959672212600708, "rewards/rejected": 1.6684404611587524, "step": 5659 }, { "epoch": 1.25, "learning_rate": 3.2357446917539337e-06, "logits/chosen": -1.9422670602798462, "logits/rejected": -1.9422670602798462, "logps/chosen": -62.79034423828125, "logps/rejected": -62.79034423828125, "loss": 0.3486, "rewards/accuracies": 0.0, "rewards/chosen": 8.50683307647705, "rewards/margins": 0.0, "rewards/rejected": 8.50683307647705, "step": 5660 }, { "epoch": 1.25, "learning_rate": 3.234067760500584e-06, "logits/chosen": -1.7771035432815552, "logits/rejected": -1.7771035432815552, "logps/chosen": -65.31461334228516, "logps/rejected": -65.31461334228516, "loss": 0.3556, "rewards/accuracies": 0.0, "rewards/chosen": 5.263944149017334, "rewards/margins": 0.0, "rewards/rejected": 5.263944149017334, "step": 5661 }, { "epoch": 1.25, "learning_rate": 3.2323910561652798e-06, "logits/chosen": -1.7505220174789429, "logits/rejected": -1.6924333572387695, "logps/chosen": -39.107112884521484, "logps/rejected": -48.5917854309082, "loss": 0.6406, "rewards/accuracies": 0.0, "rewards/chosen": 2.7473926544189453, "rewards/margins": -0.883110761642456, "rewards/rejected": 3.6305034160614014, "step": 5662 }, { "epoch": 1.25, "learning_rate": 3.230714578963478e-06, "logits/chosen": -2.059056520462036, "logits/rejected": -2.075805425643921, "logps/chosen": -126.19600677490234, "logps/rejected": -66.21016693115234, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 9.195568084716797, "rewards/margins": 3.3633599281311035, "rewards/rejected": 5.832208156585693, "step": 5663 }, { "epoch": 1.25, "learning_rate": 3.2290383291105996e-06, "logits/chosen": -1.9201971292495728, "logits/rejected": -1.8541395664215088, "logps/chosen": -33.448333740234375, "logps/rejected": -6.970877647399902, "loss": 0.161, "rewards/accuracies": 1.0, "rewards/chosen": 2.192850112915039, "rewards/margins": 1.4530781507492065, "rewards/rejected": 0.7397719621658325, "step": 5664 }, { "epoch": 1.25, "learning_rate": 3.2273623068220362e-06, "logits/chosen": -1.8618072271347046, "logits/rejected": -1.8688750267028809, "logps/chosen": -48.5960693359375, "logps/rejected": -72.65567016601562, "loss": 0.3286, "rewards/accuracies": 1.0, "rewards/chosen": 4.328479290008545, "rewards/margins": 0.9219255447387695, "rewards/rejected": 3.4065537452697754, "step": 5665 }, { "epoch": 1.25, "learning_rate": 3.2256865123131575e-06, "logits/chosen": -2.0649666786193848, "logits/rejected": -1.9782040119171143, "logps/chosen": -74.36258697509766, "logps/rejected": -29.706558227539062, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 6.199146270751953, "rewards/margins": 1.8858704566955566, "rewards/rejected": 4.3132758140563965, "step": 5666 }, { "epoch": 1.25, "learning_rate": 3.224010945799295e-06, "logits/chosen": -2.196682929992676, "logits/rejected": -2.146437644958496, "logps/chosen": -118.8431167602539, "logps/rejected": -36.39644241333008, "loss": 0.1208, "rewards/accuracies": 1.0, "rewards/chosen": 3.4401192665100098, "rewards/margins": 1.3280644416809082, "rewards/rejected": 2.1120548248291016, "step": 5667 }, { "epoch": 1.25, "learning_rate": 3.2223356074957592e-06, "logits/chosen": -1.7567014694213867, "logits/rejected": -1.8366849422454834, "logps/chosen": -60.67378616333008, "logps/rejected": -79.77279663085938, "loss": 1.0713, "rewards/accuracies": 0.0, "rewards/chosen": 6.7215728759765625, "rewards/margins": -1.9001312255859375, "rewards/rejected": 8.6217041015625, "step": 5668 }, { "epoch": 1.25, "learning_rate": 3.220660497617821e-06, "logits/chosen": -2.177727460861206, "logits/rejected": -2.1403658390045166, "logps/chosen": -70.9915542602539, "logps/rejected": -48.63197708129883, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 7.174074649810791, "rewards/margins": 3.832169771194458, "rewards/rejected": 3.341904878616333, "step": 5669 }, { "epoch": 1.25, "learning_rate": 3.2189856163807336e-06, "logits/chosen": -1.982128620147705, "logits/rejected": -1.9482542276382446, "logps/chosen": -62.795997619628906, "logps/rejected": -113.50720977783203, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 4.624310493469238, "rewards/margins": 2.7258965969085693, "rewards/rejected": 1.898413896560669, "step": 5670 }, { "epoch": 1.26, "learning_rate": 3.2173109639997112e-06, "logits/chosen": -2.030161142349243, "logits/rejected": -2.013495922088623, "logps/chosen": -34.33314514160156, "logps/rejected": -44.93553161621094, "loss": 0.3287, "rewards/accuracies": 1.0, "rewards/chosen": 3.8253281116485596, "rewards/margins": 0.7550902366638184, "rewards/rejected": 3.070237874984741, "step": 5671 }, { "epoch": 1.26, "learning_rate": 3.215636540689947e-06, "logits/chosen": -1.8161845207214355, "logits/rejected": -1.8254510164260864, "logps/chosen": -61.46186447143555, "logps/rejected": -77.9609375, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": 3.991926908493042, "rewards/margins": 1.4077701568603516, "rewards/rejected": 2.5841567516326904, "step": 5672 }, { "epoch": 1.26, "learning_rate": 3.213962346666596e-06, "logits/chosen": -2.1199920177459717, "logits/rejected": -2.0777134895324707, "logps/chosen": -99.14559936523438, "logps/rejected": -18.87179946899414, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 5.0047807693481445, "rewards/margins": 4.997615814208984, "rewards/rejected": 0.007164955139160156, "step": 5673 }, { "epoch": 1.26, "learning_rate": 3.2122883821447915e-06, "logits/chosen": -1.7204149961471558, "logits/rejected": -1.7003483772277832, "logps/chosen": -48.46185302734375, "logps/rejected": -66.76231384277344, "loss": 0.1745, "rewards/accuracies": 1.0, "rewards/chosen": 5.231588840484619, "rewards/margins": 0.9711480140686035, "rewards/rejected": 4.260440826416016, "step": 5674 }, { "epoch": 1.26, "learning_rate": 3.210614647339631e-06, "logits/chosen": -1.84537672996521, "logits/rejected": -1.773207664489746, "logps/chosen": -152.0075225830078, "logps/rejected": -103.08856201171875, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": 8.364250183105469, "rewards/margins": 3.684382438659668, "rewards/rejected": 4.679867744445801, "step": 5675 }, { "epoch": 1.26, "learning_rate": 3.2089411424661864e-06, "logits/chosen": -1.9336174726486206, "logits/rejected": -1.8946175575256348, "logps/chosen": -85.44672393798828, "logps/rejected": -100.60243225097656, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 10.772396087646484, "rewards/margins": 1.9351892471313477, "rewards/rejected": 8.837206840515137, "step": 5676 }, { "epoch": 1.26, "learning_rate": 3.2072678677395015e-06, "logits/chosen": -1.6406776905059814, "logits/rejected": -1.6425471305847168, "logps/chosen": -36.0772705078125, "logps/rejected": -59.92244338989258, "loss": 0.5917, "rewards/accuracies": 1.0, "rewards/chosen": 4.601760387420654, "rewards/margins": 0.716184139251709, "rewards/rejected": 3.8855762481689453, "step": 5677 }, { "epoch": 1.26, "learning_rate": 3.205594823374583e-06, "logits/chosen": -2.1039485931396484, "logits/rejected": -2.085733652114868, "logps/chosen": -114.53703308105469, "logps/rejected": -72.46565246582031, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 5.874748229980469, "rewards/margins": 3.061678171157837, "rewards/rejected": 2.813070058822632, "step": 5678 }, { "epoch": 1.26, "learning_rate": 3.2039220095864187e-06, "logits/chosen": -2.096534490585327, "logits/rejected": -2.0513851642608643, "logps/chosen": -48.12481689453125, "logps/rejected": -17.07413673400879, "loss": 0.2848, "rewards/accuracies": 1.0, "rewards/chosen": 4.265827178955078, "rewards/margins": 2.893965244293213, "rewards/rejected": 1.3718618154525757, "step": 5679 }, { "epoch": 1.26, "learning_rate": 3.2022494265899563e-06, "logits/chosen": -1.7634210586547852, "logits/rejected": -1.700283169746399, "logps/chosen": -36.56627655029297, "logps/rejected": -27.040084838867188, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 2.9638772010803223, "rewards/margins": 0.5966370105743408, "rewards/rejected": 2.3672401905059814, "step": 5680 }, { "epoch": 1.26, "learning_rate": 3.2005770746001232e-06, "logits/chosen": -2.004591941833496, "logits/rejected": -2.004591941833496, "logps/chosen": -52.435001373291016, "logps/rejected": -52.435001373291016, "loss": 0.3952, "rewards/accuracies": 0.0, "rewards/chosen": 5.398623943328857, "rewards/margins": 0.0, "rewards/rejected": 5.398623943328857, "step": 5681 }, { "epoch": 1.26, "learning_rate": 3.1989049538318084e-06, "logits/chosen": -1.7436277866363525, "logits/rejected": -1.7512879371643066, "logps/chosen": -48.452842712402344, "logps/rejected": -36.154666900634766, "loss": 0.1769, "rewards/accuracies": 1.0, "rewards/chosen": 3.0481834411621094, "rewards/margins": 0.8950839042663574, "rewards/rejected": 2.153099536895752, "step": 5682 }, { "epoch": 1.26, "learning_rate": 3.1972330644998796e-06, "logits/chosen": -1.6938081979751587, "logits/rejected": -1.8100618124008179, "logps/chosen": -42.51050567626953, "logps/rejected": -28.197322845458984, "loss": 0.1682, "rewards/accuracies": 1.0, "rewards/chosen": 3.4924263954162598, "rewards/margins": 1.270099401473999, "rewards/rejected": 2.2223269939422607, "step": 5683 }, { "epoch": 1.26, "learning_rate": 3.195561406819166e-06, "logits/chosen": -1.937222957611084, "logits/rejected": -1.919393539428711, "logps/chosen": -41.58310317993164, "logps/rejected": -88.52403259277344, "loss": 0.2078, "rewards/accuracies": 1.0, "rewards/chosen": 6.247342586517334, "rewards/margins": 0.9853858947753906, "rewards/rejected": 5.261956691741943, "step": 5684 }, { "epoch": 1.26, "learning_rate": 3.1938899810044745e-06, "logits/chosen": -1.6852158308029175, "logits/rejected": -1.612494707107544, "logps/chosen": -55.75135803222656, "logps/rejected": -44.55747985839844, "loss": 0.1696, "rewards/accuracies": 1.0, "rewards/chosen": 4.024720191955566, "rewards/margins": 0.9982187747955322, "rewards/rejected": 3.026501417160034, "step": 5685 }, { "epoch": 1.26, "learning_rate": 3.19221878727058e-06, "logits/chosen": -1.806437373161316, "logits/rejected": -1.7821887731552124, "logps/chosen": -52.46868133544922, "logps/rejected": -102.74529266357422, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 6.002579689025879, "rewards/margins": 2.1755096912384033, "rewards/rejected": 3.8270699977874756, "step": 5686 }, { "epoch": 1.26, "learning_rate": 3.190547825832223e-06, "logits/chosen": -1.9461225271224976, "logits/rejected": -1.8840841054916382, "logps/chosen": -40.139163970947266, "logps/rejected": -45.3909912109375, "loss": 0.241, "rewards/accuracies": 1.0, "rewards/chosen": 4.267349720001221, "rewards/margins": 1.3592746257781982, "rewards/rejected": 2.9080750942230225, "step": 5687 }, { "epoch": 1.26, "learning_rate": 3.1888770969041225e-06, "logits/chosen": -1.8338171243667603, "logits/rejected": -1.848538875579834, "logps/chosen": -65.32032775878906, "logps/rejected": -78.16944885253906, "loss": 0.1868, "rewards/accuracies": 1.0, "rewards/chosen": 7.8647661209106445, "rewards/margins": 0.8254823684692383, "rewards/rejected": 7.039283752441406, "step": 5688 }, { "epoch": 1.26, "learning_rate": 3.1872066007009595e-06, "logits/chosen": -1.9537423849105835, "logits/rejected": -1.8340996503829956, "logps/chosen": -49.04168701171875, "logps/rejected": -49.43828582763672, "loss": 0.1757, "rewards/accuracies": 1.0, "rewards/chosen": 2.390383243560791, "rewards/margins": 1.3444199562072754, "rewards/rejected": 1.0459632873535156, "step": 5689 }, { "epoch": 1.26, "learning_rate": 3.185536337437393e-06, "logits/chosen": -1.785077691078186, "logits/rejected": -1.7540720701217651, "logps/chosen": -75.62135314941406, "logps/rejected": -73.42293548583984, "loss": 0.2032, "rewards/accuracies": 1.0, "rewards/chosen": 7.136828899383545, "rewards/margins": 0.8320884704589844, "rewards/rejected": 6.3047404289245605, "step": 5690 }, { "epoch": 1.26, "learning_rate": 3.1838663073280436e-06, "logits/chosen": -2.011831521987915, "logits/rejected": -1.9649139642715454, "logps/chosen": -81.36936950683594, "logps/rejected": -116.44532775878906, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 10.056609153747559, "rewards/margins": 1.9388551712036133, "rewards/rejected": 8.117753982543945, "step": 5691 }, { "epoch": 1.26, "learning_rate": 3.1821965105875085e-06, "logits/chosen": -1.658774495124817, "logits/rejected": -1.3919763565063477, "logps/chosen": -60.285709381103516, "logps/rejected": -40.57650375366211, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 3.1471707820892334, "rewards/margins": 2.137589454650879, "rewards/rejected": 1.009581446647644, "step": 5692 }, { "epoch": 1.26, "learning_rate": 3.1805269474303507e-06, "logits/chosen": -2.074683904647827, "logits/rejected": -1.7359529733657837, "logps/chosen": -48.74613952636719, "logps/rejected": -30.507429122924805, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": 4.420590400695801, "rewards/margins": 1.2485098838806152, "rewards/rejected": 3.1720805168151855, "step": 5693 }, { "epoch": 1.26, "learning_rate": 3.1788576180711084e-06, "logits/chosen": -1.9848895072937012, "logits/rejected": -1.8220809698104858, "logps/chosen": -110.53413391113281, "logps/rejected": -55.80491638183594, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 5.527858257293701, "rewards/margins": 6.029569625854492, "rewards/rejected": -0.5017113089561462, "step": 5694 }, { "epoch": 1.26, "learning_rate": 3.177188522724284e-06, "logits/chosen": -2.0742239952087402, "logits/rejected": -2.034353494644165, "logps/chosen": -121.40730285644531, "logps/rejected": -47.7793083190918, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 7.141357421875, "rewards/margins": 4.0998454093933105, "rewards/rejected": 3.0415120124816895, "step": 5695 }, { "epoch": 1.26, "learning_rate": 3.17551966160435e-06, "logits/chosen": -1.8215186595916748, "logits/rejected": -1.8142797946929932, "logps/chosen": -32.228050231933594, "logps/rejected": -69.71768188476562, "loss": 0.5893, "rewards/accuracies": 0.0, "rewards/chosen": 4.418605327606201, "rewards/margins": -0.7122716903686523, "rewards/rejected": 5.1308770179748535, "step": 5696 }, { "epoch": 1.26, "learning_rate": 3.1738510349257556e-06, "logits/chosen": -1.9140280485153198, "logits/rejected": -1.8604024648666382, "logps/chosen": -69.90628051757812, "logps/rejected": -79.89783477783203, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 5.140498638153076, "rewards/margins": 2.2415080070495605, "rewards/rejected": 2.8989906311035156, "step": 5697 }, { "epoch": 1.26, "learning_rate": 3.172182642902912e-06, "logits/chosen": -1.748897671699524, "logits/rejected": -1.7380834817886353, "logps/chosen": -21.46581268310547, "logps/rejected": -28.818050384521484, "loss": 0.3699, "rewards/accuracies": 1.0, "rewards/chosen": 3.717390775680542, "rewards/margins": 0.24245643615722656, "rewards/rejected": 3.4749343395233154, "step": 5698 }, { "epoch": 1.26, "learning_rate": 3.1705144857502064e-06, "logits/chosen": -1.677168369293213, "logits/rejected": -1.6275863647460938, "logps/chosen": -103.66490173339844, "logps/rejected": -79.51510620117188, "loss": 0.0975, "rewards/accuracies": 1.0, "rewards/chosen": 5.102459907531738, "rewards/margins": 2.477465867996216, "rewards/rejected": 2.6249940395355225, "step": 5699 }, { "epoch": 1.26, "learning_rate": 3.16884656368199e-06, "logits/chosen": -2.1965808868408203, "logits/rejected": -2.1965808868408203, "logps/chosen": -21.93290901184082, "logps/rejected": -21.93290901184082, "loss": 0.4046, "rewards/accuracies": 0.0, "rewards/chosen": 2.296088218688965, "rewards/margins": 0.0, "rewards/rejected": 2.296088218688965, "step": 5700 }, { "epoch": 1.26, "learning_rate": 3.16717887691259e-06, "logits/chosen": -1.9415549039840698, "logits/rejected": -1.9158015251159668, "logps/chosen": -74.61393737792969, "logps/rejected": -41.13624572753906, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": 5.745543003082275, "rewards/margins": 1.8627808094024658, "rewards/rejected": 3.8827621936798096, "step": 5701 }, { "epoch": 1.26, "learning_rate": 3.165511425656296e-06, "logits/chosen": -2.014600992202759, "logits/rejected": -2.009641647338867, "logps/chosen": -47.462669372558594, "logps/rejected": -67.75035858154297, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 3.918982744216919, "rewards/margins": 2.415177345275879, "rewards/rejected": 1.5038055181503296, "step": 5702 }, { "epoch": 1.26, "learning_rate": 3.1638442101273757e-06, "logits/chosen": -2.1815030574798584, "logits/rejected": -2.177103042602539, "logps/chosen": -29.116424560546875, "logps/rejected": -43.8913688659668, "loss": 1.2947, "rewards/accuracies": 0.0, "rewards/chosen": 2.442112445831299, "rewards/margins": -1.1485931873321533, "rewards/rejected": 3.590705633163452, "step": 5703 }, { "epoch": 1.26, "learning_rate": 3.1621772305400603e-06, "logits/chosen": -1.5294710397720337, "logits/rejected": -1.5585824251174927, "logps/chosen": -24.023040771484375, "logps/rejected": -52.737022399902344, "loss": 1.1804, "rewards/accuracies": 1.0, "rewards/chosen": 2.9447269439697266, "rewards/margins": 0.18738055229187012, "rewards/rejected": 2.7573463916778564, "step": 5704 }, { "epoch": 1.26, "learning_rate": 3.160510487108551e-06, "logits/chosen": -1.7121121883392334, "logits/rejected": -1.7205698490142822, "logps/chosen": -51.06752014160156, "logps/rejected": -99.90887451171875, "loss": 0.369, "rewards/accuracies": 1.0, "rewards/chosen": 6.166938781738281, "rewards/margins": 0.06141328811645508, "rewards/rejected": 6.105525493621826, "step": 5705 }, { "epoch": 1.26, "learning_rate": 3.158843980047024e-06, "logits/chosen": -1.6839518547058105, "logits/rejected": -1.5853593349456787, "logps/chosen": -102.04347229003906, "logps/rejected": -67.39755249023438, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 9.513633728027344, "rewards/margins": 6.325350761413574, "rewards/rejected": 3.1882827281951904, "step": 5706 }, { "epoch": 1.26, "learning_rate": 3.1571777095696186e-06, "logits/chosen": -1.4953606128692627, "logits/rejected": -1.5085493326187134, "logps/chosen": -18.440303802490234, "logps/rejected": -58.385414123535156, "loss": 0.2411, "rewards/accuracies": 1.0, "rewards/chosen": 2.133636236190796, "rewards/margins": 1.1487934589385986, "rewards/rejected": 0.9848427176475525, "step": 5707 }, { "epoch": 1.26, "learning_rate": 3.155511675890448e-06, "logits/chosen": -2.0260026454925537, "logits/rejected": -1.6908891201019287, "logps/chosen": -105.26066589355469, "logps/rejected": -87.47522735595703, "loss": 0.1396, "rewards/accuracies": 1.0, "rewards/chosen": 5.13832426071167, "rewards/margins": 1.8905284404754639, "rewards/rejected": 3.247795820236206, "step": 5708 }, { "epoch": 1.26, "learning_rate": 3.153845879223592e-06, "logits/chosen": -2.12660813331604, "logits/rejected": -2.112440586090088, "logps/chosen": -125.157958984375, "logps/rejected": -44.67491149902344, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 7.919569492340088, "rewards/margins": 6.181595802307129, "rewards/rejected": 1.7379734516143799, "step": 5709 }, { "epoch": 1.26, "learning_rate": 3.152180319783104e-06, "logits/chosen": -1.8090860843658447, "logits/rejected": -1.6477445363998413, "logps/chosen": -128.3328094482422, "logps/rejected": -42.118927001953125, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 6.219098091125488, "rewards/margins": 3.748988389968872, "rewards/rejected": 2.470109701156616, "step": 5710 }, { "epoch": 1.26, "learning_rate": 3.150514997783001e-06, "logits/chosen": -1.9748197793960571, "logits/rejected": -1.9348031282424927, "logps/chosen": -105.51405334472656, "logps/rejected": -42.22701644897461, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 6.3765459060668945, "rewards/margins": 3.328619956970215, "rewards/rejected": 3.0479259490966797, "step": 5711 }, { "epoch": 1.26, "learning_rate": 3.148849913437277e-06, "logits/chosen": -1.8912073373794556, "logits/rejected": -1.7746833562850952, "logps/chosen": -58.798545837402344, "logps/rejected": -73.43672943115234, "loss": 0.1827, "rewards/accuracies": 1.0, "rewards/chosen": 3.2194314002990723, "rewards/margins": 1.1026222705841064, "rewards/rejected": 2.116809129714966, "step": 5712 }, { "epoch": 1.26, "learning_rate": 3.1471850669598865e-06, "logits/chosen": -1.807265043258667, "logits/rejected": -1.7573866844177246, "logps/chosen": -33.361785888671875, "logps/rejected": -17.806350708007812, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 3.757359266281128, "rewards/margins": 3.0971083641052246, "rewards/rejected": 0.6602508425712585, "step": 5713 }, { "epoch": 1.26, "learning_rate": 3.145520458564763e-06, "logits/chosen": -1.7487324476242065, "logits/rejected": -1.7107406854629517, "logps/chosen": -28.325284957885742, "logps/rejected": -47.51046371459961, "loss": 0.4373, "rewards/accuracies": 0.0, "rewards/chosen": 3.4115707874298096, "rewards/margins": -0.09202241897583008, "rewards/rejected": 3.5035932064056396, "step": 5714 }, { "epoch": 1.26, "learning_rate": 3.1438560884658025e-06, "logits/chosen": -1.8511637449264526, "logits/rejected": -1.7666120529174805, "logps/chosen": -123.1754150390625, "logps/rejected": -11.52249526977539, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": 4.767716884613037, "rewards/margins": 4.059743881225586, "rewards/rejected": 0.7079731225967407, "step": 5715 }, { "epoch": 1.27, "learning_rate": 3.142191956876872e-06, "logits/chosen": -2.1435766220092773, "logits/rejected": -2.0094611644744873, "logps/chosen": -109.42562866210938, "logps/rejected": -95.57208251953125, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 7.2103271484375, "rewards/margins": 1.9184556007385254, "rewards/rejected": 5.291871547698975, "step": 5716 }, { "epoch": 1.27, "learning_rate": 3.1405280640118085e-06, "logits/chosen": -2.0694687366485596, "logits/rejected": -1.9091014862060547, "logps/chosen": -92.29745483398438, "logps/rejected": -26.667821884155273, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 9.140759468078613, "rewards/margins": 7.555721759796143, "rewards/rejected": 1.5850378274917603, "step": 5717 }, { "epoch": 1.27, "learning_rate": 3.138864410084419e-06, "logits/chosen": -1.9228847026824951, "logits/rejected": -2.0007951259613037, "logps/chosen": -29.178691864013672, "logps/rejected": -135.30809020996094, "loss": 1.1359, "rewards/accuracies": 0.0, "rewards/chosen": 4.258864402770996, "rewards/margins": -2.13035249710083, "rewards/rejected": 6.389216899871826, "step": 5718 }, { "epoch": 1.27, "learning_rate": 3.1372009953084794e-06, "logits/chosen": -1.7783188819885254, "logits/rejected": -1.791623592376709, "logps/chosen": -28.80621337890625, "logps/rejected": -32.71437072753906, "loss": 0.3046, "rewards/accuracies": 1.0, "rewards/chosen": 3.3285484313964844, "rewards/margins": 0.5943984985351562, "rewards/rejected": 2.734149932861328, "step": 5719 }, { "epoch": 1.27, "learning_rate": 3.1355378198977325e-06, "logits/chosen": -1.7696669101715088, "logits/rejected": -1.7696669101715088, "logps/chosen": -57.799198150634766, "logps/rejected": -57.799198150634766, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 5.2320356369018555, "rewards/margins": 0.0, "rewards/rejected": 5.2320356369018555, "step": 5720 }, { "epoch": 1.27, "learning_rate": 3.133874884065895e-06, "logits/chosen": -1.6251825094223022, "logits/rejected": -1.5692623853683472, "logps/chosen": -33.09754943847656, "logps/rejected": -41.02254104614258, "loss": 0.8765, "rewards/accuracies": 0.0, "rewards/chosen": 2.758525848388672, "rewards/margins": -0.21443533897399902, "rewards/rejected": 2.972961187362671, "step": 5721 }, { "epoch": 1.27, "learning_rate": 3.1322121880266477e-06, "logits/chosen": -1.7438228130340576, "logits/rejected": -1.5721029043197632, "logps/chosen": -70.11299896240234, "logps/rejected": -33.25597381591797, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 2.848623037338257, "rewards/margins": 2.9902679920196533, "rewards/rejected": -0.14164486527442932, "step": 5722 }, { "epoch": 1.27, "learning_rate": 3.130549731993645e-06, "logits/chosen": -1.6981873512268066, "logits/rejected": -1.7719241380691528, "logps/chosen": -88.67108917236328, "logps/rejected": -76.81636047363281, "loss": 0.194, "rewards/accuracies": 1.0, "rewards/chosen": 6.810607433319092, "rewards/margins": 0.7920055389404297, "rewards/rejected": 6.018601894378662, "step": 5723 }, { "epoch": 1.27, "learning_rate": 3.1288875161805076e-06, "logits/chosen": -2.0160531997680664, "logits/rejected": -2.038217544555664, "logps/chosen": -40.09884262084961, "logps/rejected": -48.58067321777344, "loss": 0.2698, "rewards/accuracies": 1.0, "rewards/chosen": 4.329692363739014, "rewards/margins": 0.6633455753326416, "rewards/rejected": 3.666346788406372, "step": 5724 }, { "epoch": 1.27, "learning_rate": 3.1272255408008267e-06, "logits/chosen": -1.9503836631774902, "logits/rejected": -1.8994406461715698, "logps/chosen": -44.94346618652344, "logps/rejected": -84.99883270263672, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 4.930995941162109, "rewards/margins": 2.601942300796509, "rewards/rejected": 2.3290536403656006, "step": 5725 }, { "epoch": 1.27, "learning_rate": 3.1255638060681624e-06, "logits/chosen": -2.0422306060791016, "logits/rejected": -2.0409669876098633, "logps/chosen": -50.87283706665039, "logps/rejected": -129.0305938720703, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": 8.232779502868652, "rewards/margins": 0.21074295043945312, "rewards/rejected": 8.0220365524292, "step": 5726 }, { "epoch": 1.27, "learning_rate": 3.123902312196041e-06, "logits/chosen": -1.9686026573181152, "logits/rejected": -2.0204110145568848, "logps/chosen": -29.382291793823242, "logps/rejected": -105.23783111572266, "loss": 2.2031, "rewards/accuracies": 0.0, "rewards/chosen": 3.5690407752990723, "rewards/margins": -4.045164585113525, "rewards/rejected": 7.614205360412598, "step": 5727 }, { "epoch": 1.27, "learning_rate": 3.1222410593979648e-06, "logits/chosen": -1.9887573719024658, "logits/rejected": -1.9875141382217407, "logps/chosen": -47.36262893676758, "logps/rejected": -43.060211181640625, "loss": 0.7513, "rewards/accuracies": 0.0, "rewards/chosen": 3.494682788848877, "rewards/margins": -0.9964261054992676, "rewards/rejected": 4.4911088943481445, "step": 5728 }, { "epoch": 1.27, "learning_rate": 3.1205800478873953e-06, "logits/chosen": -2.0039589405059814, "logits/rejected": -1.9735755920410156, "logps/chosen": -71.9130859375, "logps/rejected": -78.20536804199219, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 6.4568772315979, "rewards/margins": 1.7874336242675781, "rewards/rejected": 4.669443607330322, "step": 5729 }, { "epoch": 1.27, "learning_rate": 3.1189192778777746e-06, "logits/chosen": -1.8393495082855225, "logits/rejected": -1.8371597528457642, "logps/chosen": -79.4958724975586, "logps/rejected": -111.51472473144531, "loss": 0.5808, "rewards/accuracies": 0.0, "rewards/chosen": 9.723697662353516, "rewards/margins": -0.7002420425415039, "rewards/rejected": 10.42393970489502, "step": 5730 }, { "epoch": 1.27, "learning_rate": 3.1172587495825024e-06, "logits/chosen": -2.1354682445526123, "logits/rejected": -2.1354682445526123, "logps/chosen": -62.26141357421875, "logps/rejected": -62.26141357421875, "loss": 0.3637, "rewards/accuracies": 0.0, "rewards/chosen": 11.518365859985352, "rewards/margins": 0.0, "rewards/rejected": 11.518365859985352, "step": 5731 }, { "epoch": 1.27, "learning_rate": 3.1155984632149565e-06, "logits/chosen": -1.845982313156128, "logits/rejected": -1.8650496006011963, "logps/chosen": -39.73451232910156, "logps/rejected": -63.894935607910156, "loss": 0.1432, "rewards/accuracies": 1.0, "rewards/chosen": 2.9706108570098877, "rewards/margins": 1.113294243812561, "rewards/rejected": 1.8573166131973267, "step": 5732 }, { "epoch": 1.27, "learning_rate": 3.113938418988478e-06, "logits/chosen": -1.8576747179031372, "logits/rejected": -1.8280531167984009, "logps/chosen": -46.32599639892578, "logps/rejected": -65.3459701538086, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 3.9530982971191406, "rewards/margins": 0.1802046298980713, "rewards/rejected": 3.7728936672210693, "step": 5733 }, { "epoch": 1.27, "learning_rate": 3.1122786171163786e-06, "logits/chosen": -1.7883070707321167, "logits/rejected": -1.7549487352371216, "logps/chosen": -38.19285583496094, "logps/rejected": -63.709999084472656, "loss": 0.132, "rewards/accuracies": 1.0, "rewards/chosen": 4.197182655334473, "rewards/margins": 1.2248156070709229, "rewards/rejected": 2.97236704826355, "step": 5734 }, { "epoch": 1.27, "learning_rate": 3.11061905781194e-06, "logits/chosen": -1.9548090696334839, "logits/rejected": -1.9548090696334839, "logps/chosen": -44.60857009887695, "logps/rejected": -44.60857009887695, "loss": 0.348, "rewards/accuracies": 0.0, "rewards/chosen": 3.052454710006714, "rewards/margins": 0.0, "rewards/rejected": 3.052454710006714, "step": 5735 }, { "epoch": 1.27, "learning_rate": 3.1089597412884077e-06, "logits/chosen": -1.8017057180404663, "logits/rejected": -1.8206486701965332, "logps/chosen": -66.72802734375, "logps/rejected": -67.65029907226562, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 6.248687744140625, "rewards/margins": 1.6953177452087402, "rewards/rejected": 4.553369998931885, "step": 5736 }, { "epoch": 1.27, "learning_rate": 3.107300667759005e-06, "logits/chosen": -2.1638011932373047, "logits/rejected": -2.1623101234436035, "logps/chosen": -67.70330810546875, "logps/rejected": -98.67914581298828, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 6.4530792236328125, "rewards/margins": 2.7134528160095215, "rewards/rejected": 3.739626407623291, "step": 5737 }, { "epoch": 1.27, "learning_rate": 3.1056418374369144e-06, "logits/chosen": -1.4777525663375854, "logits/rejected": -1.5106173753738403, "logps/chosen": -27.666847229003906, "logps/rejected": -64.39634704589844, "loss": 1.6276, "rewards/accuracies": 0.0, "rewards/chosen": 2.3136603832244873, "rewards/margins": -3.2046501636505127, "rewards/rejected": 5.518310546875, "step": 5738 }, { "epoch": 1.27, "learning_rate": 3.1039832505352963e-06, "logits/chosen": -1.9137417078018188, "logits/rejected": -1.912416934967041, "logps/chosen": -40.082740783691406, "logps/rejected": -18.48691177368164, "loss": 0.3623, "rewards/accuracies": 0.0, "rewards/chosen": 2.3029415607452393, "rewards/margins": -0.04762125015258789, "rewards/rejected": 2.350562810897827, "step": 5739 }, { "epoch": 1.27, "learning_rate": 3.10232490726727e-06, "logits/chosen": -1.7767772674560547, "logits/rejected": -1.7767772674560547, "logps/chosen": -27.99700927734375, "logps/rejected": -27.99700927734375, "loss": 1.7523, "rewards/accuracies": 0.0, "rewards/chosen": 4.210073947906494, "rewards/margins": 0.0, "rewards/rejected": 4.210073947906494, "step": 5740 }, { "epoch": 1.27, "learning_rate": 3.100666807845934e-06, "logits/chosen": -1.6532164812088013, "logits/rejected": -1.5270241498947144, "logps/chosen": -48.36406707763672, "logps/rejected": -12.959810256958008, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 4.692729949951172, "rewards/margins": 3.463824987411499, "rewards/rejected": 1.2289049625396729, "step": 5741 }, { "epoch": 1.27, "learning_rate": 3.099008952484346e-06, "logits/chosen": -2.129042148590088, "logits/rejected": -2.1011147499084473, "logps/chosen": -51.64495086669922, "logps/rejected": -23.337448120117188, "loss": 2.2728, "rewards/accuracies": 1.0, "rewards/chosen": 2.6482841968536377, "rewards/margins": 1.740002155303955, "rewards/rejected": 0.9082821011543274, "step": 5742 }, { "epoch": 1.27, "learning_rate": 3.097351341395538e-06, "logits/chosen": -2.0011377334594727, "logits/rejected": -1.9271461963653564, "logps/chosen": -92.30609130859375, "logps/rejected": -46.67611312866211, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 8.081076622009277, "rewards/margins": 4.516098976135254, "rewards/rejected": 3.5649776458740234, "step": 5743 }, { "epoch": 1.27, "learning_rate": 3.0956939747925075e-06, "logits/chosen": -2.227987289428711, "logits/rejected": -2.210334062576294, "logps/chosen": -74.07759094238281, "logps/rejected": -49.708438873291016, "loss": 0.1624, "rewards/accuracies": 1.0, "rewards/chosen": 4.069304943084717, "rewards/margins": 1.0810561180114746, "rewards/rejected": 2.988248825073242, "step": 5744 }, { "epoch": 1.27, "learning_rate": 3.094036852888226e-06, "logits/chosen": -2.1114892959594727, "logits/rejected": -2.0803701877593994, "logps/chosen": -47.94083023071289, "logps/rejected": -64.6795654296875, "loss": 0.2942, "rewards/accuracies": 1.0, "rewards/chosen": 4.305126667022705, "rewards/margins": 0.26390504837036133, "rewards/rejected": 4.041221618652344, "step": 5745 }, { "epoch": 1.27, "learning_rate": 3.0923799758956265e-06, "logits/chosen": -1.8471083641052246, "logits/rejected": -1.8471083641052246, "logps/chosen": -4.877974987030029, "logps/rejected": -4.877974987030029, "loss": 0.4545, "rewards/accuracies": 0.0, "rewards/chosen": 1.4208399057388306, "rewards/margins": 0.0, "rewards/rejected": 1.4208399057388306, "step": 5746 }, { "epoch": 1.27, "learning_rate": 3.090723344027612e-06, "logits/chosen": -2.0796995162963867, "logits/rejected": -2.022763252258301, "logps/chosen": -53.63579177856445, "logps/rejected": -22.672752380371094, "loss": 0.3795, "rewards/accuracies": 1.0, "rewards/chosen": 3.284788131713867, "rewards/margins": 2.6887693405151367, "rewards/rejected": 0.5960187911987305, "step": 5747 }, { "epoch": 1.27, "learning_rate": 3.0890669574970598e-06, "logits/chosen": -1.8285866975784302, "logits/rejected": -1.7446696758270264, "logps/chosen": -88.06902313232422, "logps/rejected": -66.85155487060547, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": 7.9245171546936035, "rewards/margins": 3.161379814147949, "rewards/rejected": 4.763137340545654, "step": 5748 }, { "epoch": 1.27, "learning_rate": 3.0874108165168092e-06, "logits/chosen": -1.9180923700332642, "logits/rejected": -1.7337969541549683, "logps/chosen": -95.00016784667969, "logps/rejected": -41.29002380371094, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 8.845503807067871, "rewards/margins": 4.1583333015441895, "rewards/rejected": 4.687170505523682, "step": 5749 }, { "epoch": 1.27, "learning_rate": 3.0857549212996705e-06, "logits/chosen": -2.3900060653686523, "logits/rejected": -2.3843464851379395, "logps/chosen": -77.29399108886719, "logps/rejected": -76.33657836914062, "loss": 0.2305, "rewards/accuracies": 1.0, "rewards/chosen": 9.13578987121582, "rewards/margins": 0.5965147018432617, "rewards/rejected": 8.539275169372559, "step": 5750 }, { "epoch": 1.27, "learning_rate": 3.084099272058424e-06, "logits/chosen": -1.7882508039474487, "logits/rejected": -1.6350117921829224, "logps/chosen": -38.58609390258789, "logps/rejected": -240.35708618164062, "loss": 0.8659, "rewards/accuracies": 0.0, "rewards/chosen": 7.578965663909912, "rewards/margins": -1.4696640968322754, "rewards/rejected": 9.048629760742188, "step": 5751 }, { "epoch": 1.27, "learning_rate": 3.0824438690058154e-06, "logits/chosen": -2.0960819721221924, "logits/rejected": -1.6525484323501587, "logps/chosen": -58.43401336669922, "logps/rejected": -72.46337890625, "loss": 0.1662, "rewards/accuracies": 1.0, "rewards/chosen": 4.134343147277832, "rewards/margins": 0.9667389392852783, "rewards/rejected": 3.1676042079925537, "step": 5752 }, { "epoch": 1.27, "learning_rate": 3.0807887123545578e-06, "logits/chosen": -2.037665605545044, "logits/rejected": -2.0445075035095215, "logps/chosen": -30.54586410522461, "logps/rejected": -69.77674865722656, "loss": 0.5341, "rewards/accuracies": 0.0, "rewards/chosen": 4.395233631134033, "rewards/margins": -0.6454372406005859, "rewards/rejected": 5.040670871734619, "step": 5753 }, { "epoch": 1.27, "learning_rate": 3.0791338023173395e-06, "logits/chosen": -1.8891913890838623, "logits/rejected": -1.7922245264053345, "logps/chosen": -54.28822708129883, "logps/rejected": -46.03266906738281, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 6.225137233734131, "rewards/margins": 1.3466672897338867, "rewards/rejected": 4.878469944000244, "step": 5754 }, { "epoch": 1.27, "learning_rate": 3.077479139106809e-06, "logits/chosen": -1.5543506145477295, "logits/rejected": -1.4627628326416016, "logps/chosen": -50.191524505615234, "logps/rejected": -30.90658950805664, "loss": 0.2397, "rewards/accuracies": 1.0, "rewards/chosen": 2.601989507675171, "rewards/margins": 0.5402965545654297, "rewards/rejected": 2.061692953109741, "step": 5755 }, { "epoch": 1.27, "learning_rate": 3.075824722935587e-06, "logits/chosen": -1.9815478324890137, "logits/rejected": -2.027775287628174, "logps/chosen": -48.781551361083984, "logps/rejected": -123.3387451171875, "loss": 2.0644, "rewards/accuracies": 0.0, "rewards/chosen": 3.159987211227417, "rewards/margins": -3.814404249191284, "rewards/rejected": 6.974391460418701, "step": 5756 }, { "epoch": 1.27, "learning_rate": 3.0741705540162637e-06, "logits/chosen": -1.9058396816253662, "logits/rejected": -1.9327489137649536, "logps/chosen": -43.356571197509766, "logps/rejected": -79.80132293701172, "loss": 0.2975, "rewards/accuracies": 1.0, "rewards/chosen": 4.4776434898376465, "rewards/margins": 0.4914557933807373, "rewards/rejected": 3.986187696456909, "step": 5757 }, { "epoch": 1.27, "learning_rate": 3.0725166325613935e-06, "logits/chosen": -1.6929274797439575, "logits/rejected": -1.700180172920227, "logps/chosen": -26.715465545654297, "logps/rejected": -53.28874588012695, "loss": 0.9617, "rewards/accuracies": 0.0, "rewards/chosen": 1.5696239471435547, "rewards/margins": -1.5117332935333252, "rewards/rejected": 3.08135724067688, "step": 5758 }, { "epoch": 1.27, "learning_rate": 3.0708629587835036e-06, "logits/chosen": -1.9684964418411255, "logits/rejected": -1.7730576992034912, "logps/chosen": -41.04296875, "logps/rejected": -177.14569091796875, "loss": 1.0826, "rewards/accuracies": 0.0, "rewards/chosen": 4.957083225250244, "rewards/margins": -1.9948883056640625, "rewards/rejected": 6.951971530914307, "step": 5759 }, { "epoch": 1.27, "learning_rate": 3.0692095328950843e-06, "logits/chosen": -1.887413740158081, "logits/rejected": -1.905281662940979, "logps/chosen": -47.6771240234375, "logps/rejected": -96.22303771972656, "loss": 0.2804, "rewards/accuracies": 1.0, "rewards/chosen": 3.895117998123169, "rewards/margins": 0.3504753112792969, "rewards/rejected": 3.544642686843872, "step": 5760 }, { "epoch": 1.28, "learning_rate": 3.0675563551086e-06, "logits/chosen": -2.4548940658569336, "logits/rejected": -1.6474685668945312, "logps/chosen": -74.01573181152344, "logps/rejected": -62.09294891357422, "loss": 0.1534, "rewards/accuracies": 1.0, "rewards/chosen": 4.971991062164307, "rewards/margins": 1.3463280200958252, "rewards/rejected": 3.6256630420684814, "step": 5761 }, { "epoch": 1.28, "learning_rate": 3.0659034256364773e-06, "logits/chosen": -1.8667746782302856, "logits/rejected": -1.857531189918518, "logps/chosen": -61.96942901611328, "logps/rejected": -91.26634979248047, "loss": 0.9385, "rewards/accuracies": 0.0, "rewards/chosen": 4.640039920806885, "rewards/margins": -1.4834380149841309, "rewards/rejected": 6.123477935791016, "step": 5762 }, { "epoch": 1.28, "learning_rate": 3.0642507446911163e-06, "logits/chosen": -1.6456413269042969, "logits/rejected": -1.5272821187973022, "logps/chosen": -84.45870208740234, "logps/rejected": -43.9649543762207, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": 5.041823863983154, "rewards/margins": 2.939561367034912, "rewards/rejected": 2.102262496948242, "step": 5763 }, { "epoch": 1.28, "learning_rate": 3.0625983124848797e-06, "logits/chosen": -1.8396131992340088, "logits/rejected": -1.8181520700454712, "logps/chosen": -49.90929412841797, "logps/rejected": -80.52317810058594, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 3.411418914794922, "rewards/margins": 1.4751724004745483, "rewards/rejected": 1.9362465143203735, "step": 5764 }, { "epoch": 1.28, "learning_rate": 3.0609461292301047e-06, "logits/chosen": -2.125523567199707, "logits/rejected": -2.084686040878296, "logps/chosen": -81.51556396484375, "logps/rejected": -77.1134262084961, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": 6.333136081695557, "rewards/margins": 1.1781625747680664, "rewards/rejected": 5.15497350692749, "step": 5765 }, { "epoch": 1.28, "learning_rate": 3.05929419513909e-06, "logits/chosen": -2.201385259628296, "logits/rejected": -2.0859475135803223, "logps/chosen": -91.70987701416016, "logps/rejected": -61.24651336669922, "loss": 0.333, "rewards/accuracies": 1.0, "rewards/chosen": 5.837123870849609, "rewards/margins": 6.130337715148926, "rewards/rejected": -0.2932136654853821, "step": 5766 }, { "epoch": 1.28, "learning_rate": 3.0576425104241047e-06, "logits/chosen": -1.766338586807251, "logits/rejected": -1.8345414400100708, "logps/chosen": -22.035383224487305, "logps/rejected": -62.37153625488281, "loss": 0.4764, "rewards/accuracies": 0.0, "rewards/chosen": 3.351092576980591, "rewards/margins": -0.4285728931427002, "rewards/rejected": 3.779665470123291, "step": 5767 }, { "epoch": 1.28, "learning_rate": 3.05599107529739e-06, "logits/chosen": -1.8224958181381226, "logits/rejected": -1.7672609090805054, "logps/chosen": -56.375797271728516, "logps/rejected": -52.474884033203125, "loss": 1.3489, "rewards/accuracies": 1.0, "rewards/chosen": 3.4269611835479736, "rewards/margins": 1.9242305755615234, "rewards/rejected": 1.5027306079864502, "step": 5768 }, { "epoch": 1.28, "learning_rate": 3.054339889971145e-06, "logits/chosen": -1.9322657585144043, "logits/rejected": -1.943434238433838, "logps/chosen": -73.82486724853516, "logps/rejected": -43.7066764831543, "loss": 0.3231, "rewards/accuracies": 1.0, "rewards/chosen": 3.2361276149749756, "rewards/margins": 0.15574932098388672, "rewards/rejected": 3.080378293991089, "step": 5769 }, { "epoch": 1.28, "learning_rate": 3.05268895465755e-06, "logits/chosen": -1.7509467601776123, "logits/rejected": -1.7922451496124268, "logps/chosen": -19.437850952148438, "logps/rejected": -46.70695114135742, "loss": 1.2966, "rewards/accuracies": 0.0, "rewards/chosen": 2.181293249130249, "rewards/margins": -2.432267904281616, "rewards/rejected": 4.613561153411865, "step": 5770 }, { "epoch": 1.28, "learning_rate": 3.0510382695687414e-06, "logits/chosen": -1.7227052450180054, "logits/rejected": -1.715608835220337, "logps/chosen": -50.170005798339844, "logps/rejected": -82.3027114868164, "loss": 0.0639, "rewards/accuracies": 1.0, "rewards/chosen": 4.055406093597412, "rewards/margins": 2.3101096153259277, "rewards/rejected": 1.7452964782714844, "step": 5771 }, { "epoch": 1.28, "learning_rate": 3.0493878349168315e-06, "logits/chosen": -1.7505555152893066, "logits/rejected": -1.7484872341156006, "logps/chosen": -47.564144134521484, "logps/rejected": -72.38534545898438, "loss": 2.9286, "rewards/accuracies": 1.0, "rewards/chosen": 3.6523518562316895, "rewards/margins": 0.330122709274292, "rewards/rejected": 3.3222291469573975, "step": 5772 }, { "epoch": 1.28, "learning_rate": 3.0477376509138935e-06, "logits/chosen": -2.0109236240386963, "logits/rejected": -2.0121381282806396, "logps/chosen": -58.0534782409668, "logps/rejected": -58.735313415527344, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 4.326477527618408, "rewards/margins": 1.1900365352630615, "rewards/rejected": 3.1364409923553467, "step": 5773 }, { "epoch": 1.28, "learning_rate": 3.0460877177719763e-06, "logits/chosen": -2.2392220497131348, "logits/rejected": -2.163712739944458, "logps/chosen": -89.4120864868164, "logps/rejected": -92.52275848388672, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 7.695040225982666, "rewards/margins": 2.3489303588867188, "rewards/rejected": 5.346109867095947, "step": 5774 }, { "epoch": 1.28, "learning_rate": 3.0444380357030907e-06, "logits/chosen": -1.9885988235473633, "logits/rejected": -1.9816688299179077, "logps/chosen": -46.64559555053711, "logps/rejected": -38.367523193359375, "loss": 0.1722, "rewards/accuracies": 1.0, "rewards/chosen": 2.4454853534698486, "rewards/margins": 0.9221447706222534, "rewards/rejected": 1.5233405828475952, "step": 5775 }, { "epoch": 1.28, "learning_rate": 3.042788604919215e-06, "logits/chosen": -1.9428101778030396, "logits/rejected": -1.9298323392868042, "logps/chosen": -43.850704193115234, "logps/rejected": -71.81663513183594, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": 4.586817741394043, "rewards/margins": 0.5737013816833496, "rewards/rejected": 4.013116359710693, "step": 5776 }, { "epoch": 1.28, "learning_rate": 3.0411394256323e-06, "logits/chosen": -1.932665467262268, "logits/rejected": -1.828226089477539, "logps/chosen": -55.210391998291016, "logps/rejected": -32.253150939941406, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 3.4177029132843018, "rewards/margins": 2.999023914337158, "rewards/rejected": 0.41867905855178833, "step": 5777 }, { "epoch": 1.28, "learning_rate": 3.039490498054257e-06, "logits/chosen": -1.820715069770813, "logits/rejected": -1.802783489227295, "logps/chosen": -49.87255859375, "logps/rejected": -82.6644515991211, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": 4.606941223144531, "rewards/margins": 2.0706961154937744, "rewards/rejected": 2.536245107650757, "step": 5778 }, { "epoch": 1.28, "learning_rate": 3.0378418223969754e-06, "logits/chosen": -2.412445545196533, "logits/rejected": -2.2769739627838135, "logps/chosen": -18.783157348632812, "logps/rejected": -147.60516357421875, "loss": 3.1975, "rewards/accuracies": 0.0, "rewards/chosen": 3.808025360107422, "rewards/margins": -6.355993270874023, "rewards/rejected": 10.164018630981445, "step": 5779 }, { "epoch": 1.28, "learning_rate": 3.0361933988722998e-06, "logits/chosen": -1.8666266202926636, "logits/rejected": -1.662880539894104, "logps/chosen": -125.61441040039062, "logps/rejected": -17.87814712524414, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 6.899566650390625, "rewards/margins": 6.594128131866455, "rewards/rejected": 0.3054386079311371, "step": 5780 }, { "epoch": 1.28, "learning_rate": 3.034545227692054e-06, "logits/chosen": -2.091911792755127, "logits/rejected": -1.9904570579528809, "logps/chosen": -87.71308135986328, "logps/rejected": -56.24204635620117, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 6.3853936195373535, "rewards/margins": 4.2051239013671875, "rewards/rejected": 2.180269956588745, "step": 5781 }, { "epoch": 1.28, "learning_rate": 3.0328973090680202e-06, "logits/chosen": -1.730610728263855, "logits/rejected": -1.7172924280166626, "logps/chosen": -65.50514221191406, "logps/rejected": -93.3668212890625, "loss": 0.3722, "rewards/accuracies": 1.0, "rewards/chosen": 4.430656433105469, "rewards/margins": 0.5488569736480713, "rewards/rejected": 3.8817994594573975, "step": 5782 }, { "epoch": 1.28, "learning_rate": 3.031249643211953e-06, "logits/chosen": -2.010340929031372, "logits/rejected": -2.0019948482513428, "logps/chosen": -105.36976623535156, "logps/rejected": -121.00332641601562, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 10.60278034210205, "rewards/margins": 3.6197128295898438, "rewards/rejected": 6.983067512512207, "step": 5783 }, { "epoch": 1.28, "learning_rate": 3.029602230335575e-06, "logits/chosen": -1.7549415826797485, "logits/rejected": -1.71468186378479, "logps/chosen": -45.354454040527344, "logps/rejected": -99.38375091552734, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 5.497687816619873, "rewards/margins": 1.13093900680542, "rewards/rejected": 4.366748809814453, "step": 5784 }, { "epoch": 1.28, "learning_rate": 3.0279550706505746e-06, "logits/chosen": -1.8553247451782227, "logits/rejected": -1.8379966020584106, "logps/chosen": -37.20267868041992, "logps/rejected": -43.73042297363281, "loss": 0.3644, "rewards/accuracies": 0.0, "rewards/chosen": 3.8652851581573486, "rewards/margins": -0.016706466674804688, "rewards/rejected": 3.8819916248321533, "step": 5785 }, { "epoch": 1.28, "learning_rate": 3.0263081643686064e-06, "logits/chosen": -2.0510201454162598, "logits/rejected": -2.0796196460723877, "logps/chosen": -37.50455093383789, "logps/rejected": -33.26641082763672, "loss": 0.383, "rewards/accuracies": 1.0, "rewards/chosen": 3.3097500801086426, "rewards/margins": 0.3976459503173828, "rewards/rejected": 2.9121041297912598, "step": 5786 }, { "epoch": 1.28, "learning_rate": 3.0246615117012923e-06, "logits/chosen": -2.160843849182129, "logits/rejected": -2.1149704456329346, "logps/chosen": -74.21697998046875, "logps/rejected": -22.908477783203125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 5.9101152420043945, "rewards/margins": 5.2339277267456055, "rewards/rejected": 0.6761875152587891, "step": 5787 }, { "epoch": 1.28, "learning_rate": 3.023015112860228e-06, "logits/chosen": -1.8610167503356934, "logits/rejected": -1.8610167503356934, "logps/chosen": -30.00577735900879, "logps/rejected": -30.00577735900879, "loss": 0.3864, "rewards/accuracies": 0.0, "rewards/chosen": 3.460817813873291, "rewards/margins": 0.0, "rewards/rejected": 3.460817813873291, "step": 5788 }, { "epoch": 1.28, "learning_rate": 3.021368968056967e-06, "logits/chosen": -2.0038387775421143, "logits/rejected": -1.9771859645843506, "logps/chosen": -34.902957916259766, "logps/rejected": -71.3125228881836, "loss": 0.1876, "rewards/accuracies": 1.0, "rewards/chosen": 3.9388439655303955, "rewards/margins": 1.4960055351257324, "rewards/rejected": 2.442838430404663, "step": 5789 }, { "epoch": 1.28, "learning_rate": 3.0197230775030383e-06, "logits/chosen": -1.7838456630706787, "logits/rejected": -1.7752951383590698, "logps/chosen": -18.657316207885742, "logps/rejected": -42.491180419921875, "loss": 1.1151, "rewards/accuracies": 1.0, "rewards/chosen": 1.7852586507797241, "rewards/margins": 1.0909082889556885, "rewards/rejected": 0.6943504214286804, "step": 5790 }, { "epoch": 1.28, "learning_rate": 3.018077441409933e-06, "logits/chosen": -1.762843132019043, "logits/rejected": -1.7572715282440186, "logps/chosen": -54.329933166503906, "logps/rejected": -43.45346450805664, "loss": 0.2039, "rewards/accuracies": 1.0, "rewards/chosen": 4.641215801239014, "rewards/margins": 1.3409125804901123, "rewards/rejected": 3.3003032207489014, "step": 5791 }, { "epoch": 1.28, "learning_rate": 3.016432059989113e-06, "logits/chosen": -2.5170845985412598, "logits/rejected": -2.5109145641326904, "logps/chosen": -18.77304458618164, "logps/rejected": -20.570796966552734, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 2.3221874237060547, "rewards/margins": 1.274328589439392, "rewards/rejected": 1.0478588342666626, "step": 5792 }, { "epoch": 1.28, "learning_rate": 3.014786933452004e-06, "logits/chosen": -2.103929042816162, "logits/rejected": -2.023798942565918, "logps/chosen": -74.26808166503906, "logps/rejected": -85.93115997314453, "loss": 0.2164, "rewards/accuracies": 1.0, "rewards/chosen": 5.754094123840332, "rewards/margins": 0.7310023307800293, "rewards/rejected": 5.023091793060303, "step": 5793 }, { "epoch": 1.28, "learning_rate": 3.0131420620100034e-06, "logits/chosen": -1.9216349124908447, "logits/rejected": -1.9031906127929688, "logps/chosen": -49.600990295410156, "logps/rejected": -56.4249382019043, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 3.9070167541503906, "rewards/margins": 1.4564259052276611, "rewards/rejected": 2.4505908489227295, "step": 5794 }, { "epoch": 1.28, "learning_rate": 3.0114974458744694e-06, "logits/chosen": -1.8178952932357788, "logits/rejected": -1.7624205350875854, "logps/chosen": -84.4727554321289, "logps/rejected": -89.2413330078125, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 7.40511417388916, "rewards/margins": 3.1903281211853027, "rewards/rejected": 4.214786052703857, "step": 5795 }, { "epoch": 1.28, "learning_rate": 3.009853085256735e-06, "logits/chosen": -1.951878547668457, "logits/rejected": -1.951878547668457, "logps/chosen": -33.45665740966797, "logps/rejected": -33.45665740966797, "loss": 0.35, "rewards/accuracies": 0.0, "rewards/chosen": 3.1283957958221436, "rewards/margins": 0.0, "rewards/rejected": 3.1283957958221436, "step": 5796 }, { "epoch": 1.28, "learning_rate": 3.008208980368095e-06, "logits/chosen": -1.769871473312378, "logits/rejected": -1.7102936506271362, "logps/chosen": -23.10826873779297, "logps/rejected": -26.392271041870117, "loss": 0.2569, "rewards/accuracies": 1.0, "rewards/chosen": 2.730245590209961, "rewards/margins": 1.3363053798675537, "rewards/rejected": 1.3939402103424072, "step": 5797 }, { "epoch": 1.28, "learning_rate": 3.0065651314198107e-06, "logits/chosen": -1.9515817165374756, "logits/rejected": -1.9410468339920044, "logps/chosen": -67.18405151367188, "logps/rejected": -113.90670776367188, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 4.466526985168457, "rewards/margins": 1.687835931777954, "rewards/rejected": 2.778691053390503, "step": 5798 }, { "epoch": 1.28, "learning_rate": 3.004921538623118e-06, "logits/chosen": -2.0789763927459717, "logits/rejected": -2.128058671951294, "logps/chosen": -75.06332397460938, "logps/rejected": -150.17657470703125, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 10.233738899230957, "rewards/margins": 1.5162372589111328, "rewards/rejected": 8.717501640319824, "step": 5799 }, { "epoch": 1.28, "learning_rate": 3.0032782021892094e-06, "logits/chosen": -2.1699841022491455, "logits/rejected": -2.139916181564331, "logps/chosen": -79.81997680664062, "logps/rejected": -46.7354736328125, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 7.018121242523193, "rewards/margins": 2.561845302581787, "rewards/rejected": 4.456275939941406, "step": 5800 }, { "epoch": 1.28, "learning_rate": 3.0016351223292535e-06, "logits/chosen": -2.005963087081909, "logits/rejected": -1.9160692691802979, "logps/chosen": -36.770118713378906, "logps/rejected": -71.87542724609375, "loss": 0.4578, "rewards/accuracies": 1.0, "rewards/chosen": 4.00574254989624, "rewards/margins": 0.6712005138397217, "rewards/rejected": 3.3345420360565186, "step": 5801 }, { "epoch": 1.28, "learning_rate": 2.9999922992543777e-06, "logits/chosen": -1.9782737493515015, "logits/rejected": -1.865229606628418, "logps/chosen": -122.25120544433594, "logps/rejected": -55.29717254638672, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 6.613591194152832, "rewards/margins": 3.364515781402588, "rewards/rejected": 3.249075412750244, "step": 5802 }, { "epoch": 1.28, "learning_rate": 2.998349733175686e-06, "logits/chosen": -1.63539457321167, "logits/rejected": -1.63539457321167, "logps/chosen": -30.334678649902344, "logps/rejected": -30.334678649902344, "loss": 0.3605, "rewards/accuracies": 0.0, "rewards/chosen": 6.924514293670654, "rewards/margins": 0.0, "rewards/rejected": 6.924514293670654, "step": 5803 }, { "epoch": 1.28, "learning_rate": 2.9967074243042405e-06, "logits/chosen": -1.5444004535675049, "logits/rejected": -1.5444004535675049, "logps/chosen": -43.29185485839844, "logps/rejected": -43.29185485839844, "loss": 0.5388, "rewards/accuracies": 0.0, "rewards/chosen": 2.9532439708709717, "rewards/margins": 0.0, "rewards/rejected": 2.9532439708709717, "step": 5804 }, { "epoch": 1.28, "learning_rate": 2.995065372851076e-06, "logits/chosen": -1.5743178129196167, "logits/rejected": -1.5630937814712524, "logps/chosen": -32.26520538330078, "logps/rejected": -52.48916244506836, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": 4.0191216468811035, "rewards/margins": 1.6389820575714111, "rewards/rejected": 2.3801395893096924, "step": 5805 }, { "epoch": 1.29, "learning_rate": 2.993423579027192e-06, "logits/chosen": -1.8602362871170044, "logits/rejected": -1.757135033607483, "logps/chosen": -85.6492919921875, "logps/rejected": -76.06436920166016, "loss": 0.0745, "rewards/accuracies": 1.0, "rewards/chosen": 6.847827434539795, "rewards/margins": 2.0715370178222656, "rewards/rejected": 4.776290416717529, "step": 5806 }, { "epoch": 1.29, "learning_rate": 2.9917820430435524e-06, "logits/chosen": -2.3987536430358887, "logits/rejected": -2.3160040378570557, "logps/chosen": -133.76089477539062, "logps/rejected": -33.00557327270508, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": 4.883670330047607, "rewards/margins": 1.852355718612671, "rewards/rejected": 3.0313146114349365, "step": 5807 }, { "epoch": 1.29, "learning_rate": 2.9901407651110947e-06, "logits/chosen": -2.1490135192871094, "logits/rejected": -2.1066601276397705, "logps/chosen": -135.64218139648438, "logps/rejected": -60.81333923339844, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": 6.241549968719482, "rewards/margins": 2.5821292400360107, "rewards/rejected": 3.6594207286834717, "step": 5808 }, { "epoch": 1.29, "learning_rate": 2.9884997454407156e-06, "logits/chosen": -1.832969307899475, "logits/rejected": -1.8205102682113647, "logps/chosen": -30.82244300842285, "logps/rejected": -95.46452331542969, "loss": 0.2733, "rewards/accuracies": 1.0, "rewards/chosen": 4.229743480682373, "rewards/margins": 0.3383448123931885, "rewards/rejected": 3.8913986682891846, "step": 5809 }, { "epoch": 1.29, "learning_rate": 2.986858984243285e-06, "logits/chosen": -1.8004521131515503, "logits/rejected": -1.7506016492843628, "logps/chosen": -103.58899688720703, "logps/rejected": -25.521394729614258, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": 2.9564995765686035, "rewards/margins": 2.8030028343200684, "rewards/rejected": 0.15349674224853516, "step": 5810 }, { "epoch": 1.29, "learning_rate": 2.9852184817296336e-06, "logits/chosen": -1.763318657875061, "logits/rejected": -1.5834707021713257, "logps/chosen": -74.49842834472656, "logps/rejected": -57.85371398925781, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 5.154763221740723, "rewards/margins": 4.488384246826172, "rewards/rejected": 0.6663787961006165, "step": 5811 }, { "epoch": 1.29, "learning_rate": 2.983578238110566e-06, "logits/chosen": -2.017979621887207, "logits/rejected": -1.9746326208114624, "logps/chosen": -33.042381286621094, "logps/rejected": -75.68630981445312, "loss": 0.8163, "rewards/accuracies": 0.0, "rewards/chosen": 5.582470893859863, "rewards/margins": -0.6862301826477051, "rewards/rejected": 6.268701076507568, "step": 5812 }, { "epoch": 1.29, "learning_rate": 2.9819382535968448e-06, "logits/chosen": -2.202993392944336, "logits/rejected": -2.19089412689209, "logps/chosen": -82.00843048095703, "logps/rejected": -64.19868469238281, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": 6.200387001037598, "rewards/margins": 2.6891281604766846, "rewards/rejected": 3.511258840560913, "step": 5813 }, { "epoch": 1.29, "learning_rate": 2.980298528399209e-06, "logits/chosen": -1.7129040956497192, "logits/rejected": -1.7278821468353271, "logps/chosen": -30.196073532104492, "logps/rejected": -50.631805419921875, "loss": 0.6696, "rewards/accuracies": 0.0, "rewards/chosen": 2.2721750736236572, "rewards/margins": -0.7816793918609619, "rewards/rejected": 3.053854465484619, "step": 5814 }, { "epoch": 1.29, "learning_rate": 2.9786590627283568e-06, "logits/chosen": -1.9866353273391724, "logits/rejected": -1.8591822385787964, "logps/chosen": -144.38330078125, "logps/rejected": -34.51359558105469, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 6.2079362869262695, "rewards/margins": 4.971053600311279, "rewards/rejected": 1.2368828058242798, "step": 5815 }, { "epoch": 1.29, "learning_rate": 2.977019856794955e-06, "logits/chosen": -2.107252836227417, "logits/rejected": -1.8573719263076782, "logps/chosen": -109.07801818847656, "logps/rejected": -27.855587005615234, "loss": 0.2825, "rewards/accuracies": 1.0, "rewards/chosen": 7.053013801574707, "rewards/margins": 5.616917610168457, "rewards/rejected": 1.43609619140625, "step": 5816 }, { "epoch": 1.29, "learning_rate": 2.9753809108096397e-06, "logits/chosen": -1.886906623840332, "logits/rejected": -1.98183274269104, "logps/chosen": -73.6959228515625, "logps/rejected": -122.85107421875, "loss": 0.2155, "rewards/accuracies": 1.0, "rewards/chosen": 6.106625556945801, "rewards/margins": 1.6077532768249512, "rewards/rejected": 4.49887228012085, "step": 5817 }, { "epoch": 1.29, "learning_rate": 2.9737422249830095e-06, "logits/chosen": -2.120159387588501, "logits/rejected": -2.126291275024414, "logps/chosen": -27.94090461730957, "logps/rejected": -116.947509765625, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": 3.602891683578491, "rewards/margins": 1.0202577114105225, "rewards/rejected": 2.5826339721679688, "step": 5818 }, { "epoch": 1.29, "learning_rate": 2.9721037995256337e-06, "logits/chosen": -1.793679118156433, "logits/rejected": -1.7898955345153809, "logps/chosen": -51.104820251464844, "logps/rejected": -75.67474365234375, "loss": 0.2455, "rewards/accuracies": 1.0, "rewards/chosen": 2.8884408473968506, "rewards/margins": 0.4861564636230469, "rewards/rejected": 2.4022843837738037, "step": 5819 }, { "epoch": 1.29, "learning_rate": 2.970465634648042e-06, "logits/chosen": -2.015993595123291, "logits/rejected": -1.9809664487838745, "logps/chosen": -36.4200553894043, "logps/rejected": -53.538970947265625, "loss": 0.2116, "rewards/accuracies": 1.0, "rewards/chosen": 4.668712139129639, "rewards/margins": 1.3734843730926514, "rewards/rejected": 3.2952277660369873, "step": 5820 }, { "epoch": 1.29, "learning_rate": 2.96882773056074e-06, "logits/chosen": -1.836859107017517, "logits/rejected": -1.709439754486084, "logps/chosen": -38.01374816894531, "logps/rejected": -7.557054042816162, "loss": 0.6296, "rewards/accuracies": 1.0, "rewards/chosen": 2.721123456954956, "rewards/margins": 1.623807668685913, "rewards/rejected": 1.097315788269043, "step": 5821 }, { "epoch": 1.29, "learning_rate": 2.96719008747419e-06, "logits/chosen": -2.0820398330688477, "logits/rejected": -2.0461442470550537, "logps/chosen": -141.44033813476562, "logps/rejected": -60.5043830871582, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 8.396888732910156, "rewards/margins": 4.55223274230957, "rewards/rejected": 3.844655990600586, "step": 5822 }, { "epoch": 1.29, "learning_rate": 2.965552705598828e-06, "logits/chosen": -1.8319278955459595, "logits/rejected": -1.7939058542251587, "logps/chosen": -38.342041015625, "logps/rejected": -36.46558380126953, "loss": 0.2982, "rewards/accuracies": 1.0, "rewards/chosen": 4.460259914398193, "rewards/margins": 0.3776693344116211, "rewards/rejected": 4.082590579986572, "step": 5823 }, { "epoch": 1.29, "learning_rate": 2.9639155851450525e-06, "logits/chosen": -2.0570385456085205, "logits/rejected": -2.0509817600250244, "logps/chosen": -23.417877197265625, "logps/rejected": -37.56105041503906, "loss": 0.9054, "rewards/accuracies": 0.0, "rewards/chosen": 4.217704772949219, "rewards/margins": -1.3288445472717285, "rewards/rejected": 5.546549320220947, "step": 5824 }, { "epoch": 1.29, "learning_rate": 2.9622787263232296e-06, "logits/chosen": -1.818304181098938, "logits/rejected": -1.8424005508422852, "logps/chosen": -10.819077491760254, "logps/rejected": -52.903602600097656, "loss": 0.4127, "rewards/accuracies": 1.0, "rewards/chosen": 1.5771574974060059, "rewards/margins": 0.016844868659973145, "rewards/rejected": 1.5603126287460327, "step": 5825 }, { "epoch": 1.29, "learning_rate": 2.960642129343693e-06, "logits/chosen": -1.7944213151931763, "logits/rejected": -1.8230124711990356, "logps/chosen": -27.300613403320312, "logps/rejected": -21.226848602294922, "loss": 0.3637, "rewards/accuracies": 1.0, "rewards/chosen": 1.8815284967422485, "rewards/margins": 0.21360433101654053, "rewards/rejected": 1.667924165725708, "step": 5826 }, { "epoch": 1.29, "learning_rate": 2.959005794416738e-06, "logits/chosen": -1.8189197778701782, "logits/rejected": -1.7435802221298218, "logps/chosen": -48.25365447998047, "logps/rejected": -26.226879119873047, "loss": 0.7646, "rewards/accuracies": 1.0, "rewards/chosen": 4.401149749755859, "rewards/margins": 3.0931901931762695, "rewards/rejected": 1.3079594373703003, "step": 5827 }, { "epoch": 1.29, "learning_rate": 2.957369721752634e-06, "logits/chosen": -1.5880866050720215, "logits/rejected": -1.6404486894607544, "logps/chosen": -38.14796447753906, "logps/rejected": -63.25834655761719, "loss": 0.5643, "rewards/accuracies": 0.0, "rewards/chosen": 4.33504056930542, "rewards/margins": -0.5245251655578613, "rewards/rejected": 4.859565734863281, "step": 5828 }, { "epoch": 1.29, "learning_rate": 2.955733911561609e-06, "logits/chosen": -2.174805164337158, "logits/rejected": -2.194524049758911, "logps/chosen": -78.84199523925781, "logps/rejected": -133.5985107421875, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 10.184279441833496, "rewards/margins": 2.471193313598633, "rewards/rejected": 7.713086128234863, "step": 5829 }, { "epoch": 1.29, "learning_rate": 2.9540983640538635e-06, "logits/chosen": -1.9325361251831055, "logits/rejected": -1.826501488685608, "logps/chosen": -135.08489990234375, "logps/rejected": -104.43865966796875, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 8.6964111328125, "rewards/margins": 5.896066188812256, "rewards/rejected": 2.800344944000244, "step": 5830 }, { "epoch": 1.29, "learning_rate": 2.9524630794395577e-06, "logits/chosen": -2.1797590255737305, "logits/rejected": -2.102597713470459, "logps/chosen": -41.42451858520508, "logps/rejected": -39.40035629272461, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 5.426191329956055, "rewards/margins": 3.4678122997283936, "rewards/rejected": 1.9583790302276611, "step": 5831 }, { "epoch": 1.29, "learning_rate": 2.950828057928826e-06, "logits/chosen": -1.7652430534362793, "logits/rejected": -1.7652430534362793, "logps/chosen": -6.6120686531066895, "logps/rejected": -6.6120686531066895, "loss": 0.4628, "rewards/accuracies": 0.0, "rewards/chosen": 2.38202166557312, "rewards/margins": 0.0, "rewards/rejected": 2.38202166557312, "step": 5832 }, { "epoch": 1.29, "learning_rate": 2.9491932997317624e-06, "logits/chosen": -1.8423429727554321, "logits/rejected": -1.7812919616699219, "logps/chosen": -33.30885314941406, "logps/rejected": -39.243324279785156, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": 2.749312162399292, "rewards/margins": 1.9691706895828247, "rewards/rejected": 0.7801414728164673, "step": 5833 }, { "epoch": 1.29, "learning_rate": 2.9475588050584304e-06, "logits/chosen": -2.1986989974975586, "logits/rejected": -2.1837189197540283, "logps/chosen": -52.7374382019043, "logps/rejected": -44.900123596191406, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": 5.49420690536499, "rewards/margins": 1.7552573680877686, "rewards/rejected": 3.7389495372772217, "step": 5834 }, { "epoch": 1.29, "learning_rate": 2.9459245741188557e-06, "logits/chosen": -1.8590145111083984, "logits/rejected": -1.864155888557434, "logps/chosen": -35.80306625366211, "logps/rejected": -47.55323028564453, "loss": 1.6357, "rewards/accuracies": 1.0, "rewards/chosen": 4.390367031097412, "rewards/margins": 1.129030466079712, "rewards/rejected": 3.2613365650177, "step": 5835 }, { "epoch": 1.29, "learning_rate": 2.944290607123038e-06, "logits/chosen": -2.0543644428253174, "logits/rejected": -2.0761032104492188, "logps/chosen": -59.184879302978516, "logps/rejected": -60.5332145690918, "loss": 1.8838, "rewards/accuracies": 0.0, "rewards/chosen": 3.6456830501556396, "rewards/margins": -3.4797823429107666, "rewards/rejected": 7.125465393066406, "step": 5836 }, { "epoch": 1.29, "learning_rate": 2.9426569042809356e-06, "logits/chosen": -1.7588789463043213, "logits/rejected": -1.7335480451583862, "logps/chosen": -56.7681884765625, "logps/rejected": -43.308006286621094, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 3.0483810901641846, "rewards/margins": 1.496180772781372, "rewards/rejected": 1.5522003173828125, "step": 5837 }, { "epoch": 1.29, "learning_rate": 2.941023465802473e-06, "logits/chosen": -1.7011866569519043, "logits/rejected": -1.592045783996582, "logps/chosen": -51.244468688964844, "logps/rejected": -43.627098083496094, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 2.866767168045044, "rewards/margins": 1.3421417474746704, "rewards/rejected": 1.5246254205703735, "step": 5838 }, { "epoch": 1.29, "learning_rate": 2.939390291897547e-06, "logits/chosen": -2.030754566192627, "logits/rejected": -2.036348819732666, "logps/chosen": -56.727561950683594, "logps/rejected": -48.2567138671875, "loss": 0.4891, "rewards/accuracies": 1.0, "rewards/chosen": 3.944521427154541, "rewards/margins": 0.023612260818481445, "rewards/rejected": 3.9209091663360596, "step": 5839 }, { "epoch": 1.29, "learning_rate": 2.937757382776013e-06, "logits/chosen": -1.8278889656066895, "logits/rejected": -1.9908357858657837, "logps/chosen": -69.81980895996094, "logps/rejected": -169.2550048828125, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 10.65031623840332, "rewards/margins": 1.6515731811523438, "rewards/rejected": 8.998743057250977, "step": 5840 }, { "epoch": 1.29, "learning_rate": 2.9361247386477006e-06, "logits/chosen": -1.7775979042053223, "logits/rejected": -1.7885032892227173, "logps/chosen": -28.977317810058594, "logps/rejected": -93.26472473144531, "loss": 0.3599, "rewards/accuracies": 1.0, "rewards/chosen": 4.840550899505615, "rewards/margins": 0.14740371704101562, "rewards/rejected": 4.6931471824646, "step": 5841 }, { "epoch": 1.29, "learning_rate": 2.9344923597223962e-06, "logits/chosen": -2.2303802967071533, "logits/rejected": -2.2303802967071533, "logps/chosen": -21.07965087890625, "logps/rejected": -21.07965087890625, "loss": 0.4258, "rewards/accuracies": 0.0, "rewards/chosen": 3.6854207515716553, "rewards/margins": 0.0, "rewards/rejected": 3.6854207515716553, "step": 5842 }, { "epoch": 1.29, "learning_rate": 2.9328602462098595e-06, "logits/chosen": -2.1907050609588623, "logits/rejected": -2.1687984466552734, "logps/chosen": -72.41091918945312, "logps/rejected": -51.40239715576172, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": 4.485970973968506, "rewards/margins": 2.035407066345215, "rewards/rejected": 2.450563907623291, "step": 5843 }, { "epoch": 1.29, "learning_rate": 2.93122839831981e-06, "logits/chosen": -2.021855354309082, "logits/rejected": -1.8822399377822876, "logps/chosen": -66.99403381347656, "logps/rejected": -27.33692169189453, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 4.456000804901123, "rewards/margins": 3.637263059616089, "rewards/rejected": 0.818737804889679, "step": 5844 }, { "epoch": 1.29, "learning_rate": 2.929596816261939e-06, "logits/chosen": -2.041332721710205, "logits/rejected": -2.071378707885742, "logps/chosen": -68.52139282226562, "logps/rejected": -143.32452392578125, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": 8.8058443069458, "rewards/margins": 2.6393465995788574, "rewards/rejected": 6.166497707366943, "step": 5845 }, { "epoch": 1.29, "learning_rate": 2.9279655002458996e-06, "logits/chosen": -1.9959421157836914, "logits/rejected": -2.050631284713745, "logps/chosen": -55.38907241821289, "logps/rejected": -70.9728775024414, "loss": 0.8021, "rewards/accuracies": 0.0, "rewards/chosen": 5.937270641326904, "rewards/margins": -0.39664745330810547, "rewards/rejected": 6.33391809463501, "step": 5846 }, { "epoch": 1.29, "learning_rate": 2.9263344504813134e-06, "logits/chosen": -1.9180461168289185, "logits/rejected": -1.9323967695236206, "logps/chosen": -30.91542625427246, "logps/rejected": -55.34562683105469, "loss": 0.5511, "rewards/accuracies": 1.0, "rewards/chosen": 3.8163559436798096, "rewards/margins": 0.35968637466430664, "rewards/rejected": 3.456669569015503, "step": 5847 }, { "epoch": 1.29, "learning_rate": 2.9247036671777664e-06, "logits/chosen": -2.235820770263672, "logits/rejected": -2.189709186553955, "logps/chosen": -87.16534423828125, "logps/rejected": -42.15557098388672, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": 5.970587253570557, "rewards/margins": 2.2740089893341064, "rewards/rejected": 3.69657826423645, "step": 5848 }, { "epoch": 1.29, "learning_rate": 2.9230731505448063e-06, "logits/chosen": -1.9810867309570312, "logits/rejected": -1.9141247272491455, "logps/chosen": -60.61547088623047, "logps/rejected": -39.741119384765625, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 3.333824872970581, "rewards/margins": 1.9397914409637451, "rewards/rejected": 1.394033432006836, "step": 5849 }, { "epoch": 1.29, "learning_rate": 2.9214429007919553e-06, "logits/chosen": -1.9131481647491455, "logits/rejected": -1.9131481647491455, "logps/chosen": -54.6093864440918, "logps/rejected": -54.6093864440918, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 7.5052170753479, "rewards/margins": 0.0, "rewards/rejected": 7.5052170753479, "step": 5850 }, { "epoch": 1.3, "learning_rate": 2.9198129181286944e-06, "logits/chosen": -2.0790622234344482, "logits/rejected": -2.024625539779663, "logps/chosen": -57.257415771484375, "logps/rejected": -2.4909863471984863, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 3.474255323410034, "rewards/margins": 2.6769185066223145, "rewards/rejected": 0.7973369359970093, "step": 5851 }, { "epoch": 1.3, "learning_rate": 2.9181832027644728e-06, "logits/chosen": -1.7710243463516235, "logits/rejected": -1.8896647691726685, "logps/chosen": -24.744495391845703, "logps/rejected": -88.93424224853516, "loss": 3.9636, "rewards/accuracies": 0.0, "rewards/chosen": 2.892279863357544, "rewards/margins": -7.919307708740234, "rewards/rejected": 10.8115873336792, "step": 5852 }, { "epoch": 1.3, "learning_rate": 2.9165537549087053e-06, "logits/chosen": -2.183013439178467, "logits/rejected": -2.175804615020752, "logps/chosen": -39.34123229980469, "logps/rejected": -77.58769226074219, "loss": 0.4353, "rewards/accuracies": 0.0, "rewards/chosen": 2.5738937854766846, "rewards/margins": -0.32062220573425293, "rewards/rejected": 2.8945159912109375, "step": 5853 }, { "epoch": 1.3, "learning_rate": 2.9149245747707714e-06, "logits/chosen": -1.8148603439331055, "logits/rejected": -1.7957125902175903, "logps/chosen": -64.04449462890625, "logps/rejected": -72.16484069824219, "loss": 0.4526, "rewards/accuracies": 0.0, "rewards/chosen": 4.370657444000244, "rewards/margins": -0.3810129165649414, "rewards/rejected": 4.7516703605651855, "step": 5854 }, { "epoch": 1.3, "learning_rate": 2.9132956625600174e-06, "logits/chosen": -2.0609841346740723, "logits/rejected": -2.047612428665161, "logps/chosen": -153.15419006347656, "logps/rejected": -93.50677490234375, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 8.752520561218262, "rewards/margins": 1.5225400924682617, "rewards/rejected": 7.22998046875, "step": 5855 }, { "epoch": 1.3, "learning_rate": 2.9116670184857565e-06, "logits/chosen": -2.166516065597534, "logits/rejected": -2.2093746662139893, "logps/chosen": -73.29696655273438, "logps/rejected": -133.16061401367188, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 8.942761421203613, "rewards/margins": 2.2495651245117188, "rewards/rejected": 6.6931962966918945, "step": 5856 }, { "epoch": 1.3, "learning_rate": 2.9100386427572617e-06, "logits/chosen": -1.504822015762329, "logits/rejected": -1.481408715248108, "logps/chosen": -39.81447219848633, "logps/rejected": -28.664165496826172, "loss": 0.6137, "rewards/accuracies": 0.0, "rewards/chosen": 2.592071294784546, "rewards/margins": -0.8205504417419434, "rewards/rejected": 3.4126217365264893, "step": 5857 }, { "epoch": 1.3, "learning_rate": 2.908410535583777e-06, "logits/chosen": -2.2715256214141846, "logits/rejected": -2.310349464416504, "logps/chosen": -109.827392578125, "logps/rejected": -127.81529235839844, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 12.758196830749512, "rewards/margins": 5.6628828048706055, "rewards/rejected": 7.095314025878906, "step": 5858 }, { "epoch": 1.3, "learning_rate": 2.9067826971745105e-06, "logits/chosen": -1.5610483884811401, "logits/rejected": -1.5584936141967773, "logps/chosen": -52.30712127685547, "logps/rejected": -52.79385757446289, "loss": 0.5651, "rewards/accuracies": 1.0, "rewards/chosen": 2.5827949047088623, "rewards/margins": 0.6014857292175293, "rewards/rejected": 1.981309175491333, "step": 5859 }, { "epoch": 1.3, "learning_rate": 2.9051551277386357e-06, "logits/chosen": -1.8294812440872192, "logits/rejected": -1.7542141675949097, "logps/chosen": -101.12928771972656, "logps/rejected": -132.8541259765625, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 12.66205883026123, "rewards/margins": 3.474048614501953, "rewards/rejected": 9.188010215759277, "step": 5860 }, { "epoch": 1.3, "learning_rate": 2.9035278274852907e-06, "logits/chosen": -2.0757408142089844, "logits/rejected": -2.0209403038024902, "logps/chosen": -48.97163391113281, "logps/rejected": -33.917762756347656, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 2.600261688232422, "rewards/margins": 1.7704112529754639, "rewards/rejected": 0.8298503756523132, "step": 5861 }, { "epoch": 1.3, "learning_rate": 2.90190079662358e-06, "logits/chosen": -2.302565097808838, "logits/rejected": -2.2333080768585205, "logps/chosen": -114.70642852783203, "logps/rejected": -116.8010025024414, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 6.658933162689209, "rewards/margins": 3.388803005218506, "rewards/rejected": 3.270130157470703, "step": 5862 }, { "epoch": 1.3, "learning_rate": 2.9002740353625737e-06, "logits/chosen": -2.096691131591797, "logits/rejected": -2.014728546142578, "logps/chosen": -120.65264129638672, "logps/rejected": -40.30488204956055, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 6.919266700744629, "rewards/margins": 3.8669934272766113, "rewards/rejected": 3.0522732734680176, "step": 5863 }, { "epoch": 1.3, "learning_rate": 2.898647543911306e-06, "logits/chosen": -1.8781558275222778, "logits/rejected": -1.8633116483688354, "logps/chosen": -73.82865905761719, "logps/rejected": -67.39663696289062, "loss": 0.4615, "rewards/accuracies": 1.0, "rewards/chosen": 4.333627223968506, "rewards/margins": 1.3534924983978271, "rewards/rejected": 2.9801347255706787, "step": 5864 }, { "epoch": 1.3, "learning_rate": 2.8970213224787795e-06, "logits/chosen": -1.9830049276351929, "logits/rejected": -1.9556413888931274, "logps/chosen": -52.54188537597656, "logps/rejected": -38.88130569458008, "loss": 0.0847, "rewards/accuracies": 1.0, "rewards/chosen": 4.255870342254639, "rewards/margins": 1.7382025718688965, "rewards/rejected": 2.517667770385742, "step": 5865 }, { "epoch": 1.3, "learning_rate": 2.8953953712739534e-06, "logits/chosen": -2.163552761077881, "logits/rejected": -2.15350341796875, "logps/chosen": -185.61451721191406, "logps/rejected": -83.7369613647461, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 7.567637920379639, "rewards/margins": 2.8505501747131348, "rewards/rejected": 4.717087745666504, "step": 5866 }, { "epoch": 1.3, "learning_rate": 2.8937696905057677e-06, "logits/chosen": -1.866931438446045, "logits/rejected": -1.5925856828689575, "logps/chosen": -166.39178466796875, "logps/rejected": -36.720394134521484, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.572572231292725, "rewards/margins": 7.028689861297607, "rewards/rejected": -0.4561176300048828, "step": 5867 }, { "epoch": 1.3, "learning_rate": 2.89214428038311e-06, "logits/chosen": -1.951959490776062, "logits/rejected": -2.015500545501709, "logps/chosen": -31.28290557861328, "logps/rejected": -107.15975189208984, "loss": 1.3437, "rewards/accuracies": 1.0, "rewards/chosen": 4.362809181213379, "rewards/margins": 0.1051177978515625, "rewards/rejected": 4.257691383361816, "step": 5868 }, { "epoch": 1.3, "learning_rate": 2.8905191411148466e-06, "logits/chosen": -1.5158789157867432, "logits/rejected": -1.4602853059768677, "logps/chosen": -25.463624954223633, "logps/rejected": -31.707866668701172, "loss": 0.4788, "rewards/accuracies": 0.0, "rewards/chosen": 3.526143789291382, "rewards/margins": -0.4388468265533447, "rewards/rejected": 3.9649906158447266, "step": 5869 }, { "epoch": 1.3, "learning_rate": 2.8888942729098014e-06, "logits/chosen": -1.789524793624878, "logits/rejected": -1.7432644367218018, "logps/chosen": -44.11061096191406, "logps/rejected": -46.52007293701172, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 3.7864654064178467, "rewards/margins": 1.340898036956787, "rewards/rejected": 2.4455673694610596, "step": 5870 }, { "epoch": 1.3, "learning_rate": 2.8872696759767676e-06, "logits/chosen": -1.7676525115966797, "logits/rejected": -1.7896156311035156, "logps/chosen": -50.04836654663086, "logps/rejected": -76.40940856933594, "loss": 0.8085, "rewards/accuracies": 0.0, "rewards/chosen": 1.5882755517959595, "rewards/margins": -1.0249980688095093, "rewards/rejected": 2.6132736206054688, "step": 5871 }, { "epoch": 1.3, "learning_rate": 2.8856453505245018e-06, "logits/chosen": -1.6890394687652588, "logits/rejected": -1.6890991926193237, "logps/chosen": -55.2950439453125, "logps/rejected": -51.01618194580078, "loss": 0.2924, "rewards/accuracies": 1.0, "rewards/chosen": 3.0614898204803467, "rewards/margins": 0.703641414642334, "rewards/rejected": 2.3578484058380127, "step": 5872 }, { "epoch": 1.3, "learning_rate": 2.8840212967617253e-06, "logits/chosen": -2.0517656803131104, "logits/rejected": -2.043207883834839, "logps/chosen": -57.128623962402344, "logps/rejected": -26.08146095275879, "loss": 0.9801, "rewards/accuracies": 0.0, "rewards/chosen": 4.294677257537842, "rewards/margins": -1.7434263229370117, "rewards/rejected": 6.0381035804748535, "step": 5873 }, { "epoch": 1.3, "learning_rate": 2.882397514897128e-06, "logits/chosen": -2.3691413402557373, "logits/rejected": -2.3744077682495117, "logps/chosen": -43.18737030029297, "logps/rejected": -21.815372467041016, "loss": 0.1366, "rewards/accuracies": 1.0, "rewards/chosen": 2.794590711593628, "rewards/margins": 1.7326892614364624, "rewards/rejected": 1.0619014501571655, "step": 5874 }, { "epoch": 1.3, "learning_rate": 2.880774005139355e-06, "logits/chosen": -1.926321268081665, "logits/rejected": -1.8804880380630493, "logps/chosen": -137.6108856201172, "logps/rejected": -43.802818298339844, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 6.2021684646606445, "rewards/margins": 1.678652286529541, "rewards/rejected": 4.5235161781311035, "step": 5875 }, { "epoch": 1.3, "learning_rate": 2.879150767697032e-06, "logits/chosen": -1.7342740297317505, "logits/rejected": -1.5998847484588623, "logps/chosen": -25.956567764282227, "logps/rejected": -68.68193054199219, "loss": 0.1487, "rewards/accuracies": 1.0, "rewards/chosen": 6.797345161437988, "rewards/margins": 3.209887742996216, "rewards/rejected": 3.5874574184417725, "step": 5876 }, { "epoch": 1.3, "learning_rate": 2.8775278027787344e-06, "logits/chosen": -1.903795599937439, "logits/rejected": -1.9439046382904053, "logps/chosen": -81.63475036621094, "logps/rejected": -133.13955688476562, "loss": 0.9962, "rewards/accuracies": 0.0, "rewards/chosen": 6.501387119293213, "rewards/margins": -1.8093457221984863, "rewards/rejected": 8.3107328414917, "step": 5877 }, { "epoch": 1.3, "learning_rate": 2.8759051105930124e-06, "logits/chosen": -1.9088985919952393, "logits/rejected": -1.909149169921875, "logps/chosen": -74.17741394042969, "logps/rejected": -75.78192901611328, "loss": 0.3027, "rewards/accuracies": 1.0, "rewards/chosen": 3.8972671031951904, "rewards/margins": 0.45416927337646484, "rewards/rejected": 3.4430978298187256, "step": 5878 }, { "epoch": 1.3, "learning_rate": 2.874282691348378e-06, "logits/chosen": -1.7657999992370605, "logits/rejected": -1.660306692123413, "logps/chosen": -53.166221618652344, "logps/rejected": -8.443251609802246, "loss": 0.3499, "rewards/accuracies": 1.0, "rewards/chosen": 5.626323699951172, "rewards/margins": 4.998691082000732, "rewards/rejected": 0.6276327967643738, "step": 5879 }, { "epoch": 1.3, "learning_rate": 2.8726605452533073e-06, "logits/chosen": -2.371706962585449, "logits/rejected": -2.3286423683166504, "logps/chosen": -98.5161361694336, "logps/rejected": -61.56441879272461, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 5.593642711639404, "rewards/margins": 3.9604856967926025, "rewards/rejected": 1.6331570148468018, "step": 5880 }, { "epoch": 1.3, "learning_rate": 2.8710386725162444e-06, "logits/chosen": -2.038862705230713, "logits/rejected": -2.013275623321533, "logps/chosen": -40.75315856933594, "logps/rejected": -55.05561065673828, "loss": 0.1745, "rewards/accuracies": 1.0, "rewards/chosen": 6.060003757476807, "rewards/margins": 0.9400854110717773, "rewards/rejected": 5.119918346405029, "step": 5881 }, { "epoch": 1.3, "learning_rate": 2.86941707334559e-06, "logits/chosen": -1.709261417388916, "logits/rejected": -1.6959104537963867, "logps/chosen": -44.9709587097168, "logps/rejected": -49.856788635253906, "loss": 0.2322, "rewards/accuracies": 1.0, "rewards/chosen": 2.507181167602539, "rewards/margins": 0.5753574371337891, "rewards/rejected": 1.93182373046875, "step": 5882 }, { "epoch": 1.3, "learning_rate": 2.867795747949724e-06, "logits/chosen": -1.8340777158737183, "logits/rejected": -1.8128429651260376, "logps/chosen": -60.929771423339844, "logps/rejected": -79.156005859375, "loss": 0.0737, "rewards/accuracies": 1.0, "rewards/chosen": 5.498514652252197, "rewards/margins": 2.0433807373046875, "rewards/rejected": 3.4551339149475098, "step": 5883 }, { "epoch": 1.3, "learning_rate": 2.866174696536975e-06, "logits/chosen": -1.9899722337722778, "logits/rejected": -1.9906020164489746, "logps/chosen": -13.236776351928711, "logps/rejected": -41.66885757446289, "loss": 0.4833, "rewards/accuracies": 0.0, "rewards/chosen": 2.644037961959839, "rewards/margins": -0.34855055809020996, "rewards/rejected": 2.992588520050049, "step": 5884 }, { "epoch": 1.3, "learning_rate": 2.864553919315652e-06, "logits/chosen": -1.7975444793701172, "logits/rejected": -1.7626848220825195, "logps/chosen": -53.596275329589844, "logps/rejected": -40.11328887939453, "loss": 0.1981, "rewards/accuracies": 1.0, "rewards/chosen": 3.464111328125, "rewards/margins": 0.892066240310669, "rewards/rejected": 2.572045087814331, "step": 5885 }, { "epoch": 1.3, "learning_rate": 2.8629334164940127e-06, "logits/chosen": -1.9249364137649536, "logits/rejected": -1.8418713808059692, "logps/chosen": -129.03378295898438, "logps/rejected": -92.44072723388672, "loss": 0.087, "rewards/accuracies": 1.0, "rewards/chosen": 5.595120429992676, "rewards/margins": 3.672278881072998, "rewards/rejected": 1.9228416681289673, "step": 5886 }, { "epoch": 1.3, "learning_rate": 2.861313188280296e-06, "logits/chosen": -1.781302809715271, "logits/rejected": -1.7137587070465088, "logps/chosen": -80.88105010986328, "logps/rejected": -106.55142211914062, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 6.648879528045654, "rewards/margins": 3.1005518436431885, "rewards/rejected": 3.548327684402466, "step": 5887 }, { "epoch": 1.3, "learning_rate": 2.859693234882691e-06, "logits/chosen": -1.6127569675445557, "logits/rejected": -1.554119348526001, "logps/chosen": -26.33021354675293, "logps/rejected": -57.873321533203125, "loss": 0.2759, "rewards/accuracies": 1.0, "rewards/chosen": 2.3128879070281982, "rewards/margins": 0.37342023849487305, "rewards/rejected": 1.9394676685333252, "step": 5888 }, { "epoch": 1.3, "learning_rate": 2.8580735565093607e-06, "logits/chosen": -1.9384993314743042, "logits/rejected": -1.9423174858093262, "logps/chosen": -22.3885498046875, "logps/rejected": -56.954002380371094, "loss": 2.3824, "rewards/accuracies": 0.0, "rewards/chosen": 2.8053436279296875, "rewards/margins": -0.36990904808044434, "rewards/rejected": 3.175252676010132, "step": 5889 }, { "epoch": 1.3, "learning_rate": 2.856454153368431e-06, "logits/chosen": -2.042630195617676, "logits/rejected": -1.9219646453857422, "logps/chosen": -112.03022766113281, "logps/rejected": -63.52980041503906, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 8.631695747375488, "rewards/margins": 4.46480131149292, "rewards/rejected": 4.166894435882568, "step": 5890 }, { "epoch": 1.3, "learning_rate": 2.854835025667986e-06, "logits/chosen": -1.9545282125473022, "logits/rejected": -1.9650179147720337, "logps/chosen": -28.893463134765625, "logps/rejected": -107.25016784667969, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 4.092573642730713, "rewards/margins": 1.410714864730835, "rewards/rejected": 2.681858777999878, "step": 5891 }, { "epoch": 1.3, "learning_rate": 2.8532161736160866e-06, "logits/chosen": -2.2459304332733154, "logits/rejected": -2.254293203353882, "logps/chosen": -50.24072265625, "logps/rejected": -60.83110046386719, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": 5.881727695465088, "rewards/margins": 2.405778646469116, "rewards/rejected": 3.4759490489959717, "step": 5892 }, { "epoch": 1.3, "learning_rate": 2.8515975974207454e-06, "logits/chosen": -1.7764132022857666, "logits/rejected": -1.6983081102371216, "logps/chosen": -41.86091613769531, "logps/rejected": -50.76811599731445, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 5.81695556640625, "rewards/margins": 4.639346599578857, "rewards/rejected": 1.177608847618103, "step": 5893 }, { "epoch": 1.3, "learning_rate": 2.8499792972899517e-06, "logits/chosen": -1.9067928791046143, "logits/rejected": -1.9036349058151245, "logps/chosen": -62.02111053466797, "logps/rejected": -62.56450653076172, "loss": 0.7035, "rewards/accuracies": 0.0, "rewards/chosen": 5.091135501861572, "rewards/margins": -1.0262665748596191, "rewards/rejected": 6.117402076721191, "step": 5894 }, { "epoch": 1.3, "learning_rate": 2.8483612734316456e-06, "logits/chosen": -1.8941304683685303, "logits/rejected": -1.8863567113876343, "logps/chosen": -67.08834838867188, "logps/rejected": -48.46086502075195, "loss": 0.2057, "rewards/accuracies": 1.0, "rewards/chosen": 4.7855224609375, "rewards/margins": 0.7858684062957764, "rewards/rejected": 3.9996540546417236, "step": 5895 }, { "epoch": 1.31, "learning_rate": 2.8467435260537475e-06, "logits/chosen": -2.43786358833313, "logits/rejected": -2.4661991596221924, "logps/chosen": -114.12403106689453, "logps/rejected": -109.31353759765625, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": 5.365349769592285, "rewards/margins": 4.72265100479126, "rewards/rejected": 0.6426987051963806, "step": 5896 }, { "epoch": 1.31, "learning_rate": 2.845126055364128e-06, "logits/chosen": -2.3695406913757324, "logits/rejected": -2.3358402252197266, "logps/chosen": -75.25665283203125, "logps/rejected": -102.48929595947266, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": 7.670922756195068, "rewards/margins": 1.980893611907959, "rewards/rejected": 5.690029144287109, "step": 5897 }, { "epoch": 1.31, "learning_rate": 2.8435088615706302e-06, "logits/chosen": -1.8694788217544556, "logits/rejected": -1.793027639389038, "logps/chosen": -75.21726989746094, "logps/rejected": -84.24759674072266, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 3.468951463699341, "rewards/margins": 0.4719703197479248, "rewards/rejected": 2.996981143951416, "step": 5898 }, { "epoch": 1.31, "learning_rate": 2.8418919448810623e-06, "logits/chosen": -1.5955870151519775, "logits/rejected": -1.6164605617523193, "logps/chosen": -44.83476257324219, "logps/rejected": -58.85334777832031, "loss": 0.4362, "rewards/accuracies": 0.0, "rewards/chosen": 3.3384628295898438, "rewards/margins": -0.2217719554901123, "rewards/rejected": 3.560234785079956, "step": 5899 }, { "epoch": 1.31, "learning_rate": 2.840275305503186e-06, "logits/chosen": -1.7929927110671997, "logits/rejected": -1.825019359588623, "logps/chosen": -36.75439453125, "logps/rejected": -58.49003601074219, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": 3.7151496410369873, "rewards/margins": 1.3236029148101807, "rewards/rejected": 2.3915467262268066, "step": 5900 }, { "epoch": 1.31, "learning_rate": 2.8386589436447465e-06, "logits/chosen": -2.198141098022461, "logits/rejected": -2.178640842437744, "logps/chosen": -56.02466583251953, "logps/rejected": -32.399539947509766, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": 4.21942138671875, "rewards/margins": 2.2389400005340576, "rewards/rejected": 1.9804813861846924, "step": 5901 }, { "epoch": 1.31, "learning_rate": 2.8370428595134327e-06, "logits/chosen": -1.8893187046051025, "logits/rejected": -1.7792397737503052, "logps/chosen": -48.10301971435547, "logps/rejected": -37.77781677246094, "loss": 0.5483, "rewards/accuracies": 0.0, "rewards/chosen": 2.1146767139434814, "rewards/margins": -0.6100082397460938, "rewards/rejected": 2.724684953689575, "step": 5902 }, { "epoch": 1.31, "learning_rate": 2.835427053316916e-06, "logits/chosen": -1.8653737306594849, "logits/rejected": -1.8208683729171753, "logps/chosen": -59.41862106323242, "logps/rejected": -66.71357727050781, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 4.867452621459961, "rewards/margins": 2.168180465698242, "rewards/rejected": 2.6992721557617188, "step": 5903 }, { "epoch": 1.31, "learning_rate": 2.8338115252628175e-06, "logits/chosen": -2.312920331954956, "logits/rejected": -2.3195903301239014, "logps/chosen": -52.093658447265625, "logps/rejected": -137.42401123046875, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 6.549447536468506, "rewards/margins": 0.4459228515625, "rewards/rejected": 6.103524684906006, "step": 5904 }, { "epoch": 1.31, "learning_rate": 2.8321962755587295e-06, "logits/chosen": -1.9839376211166382, "logits/rejected": -1.9963451623916626, "logps/chosen": -37.99992370605469, "logps/rejected": -66.62517547607422, "loss": 0.5354, "rewards/accuracies": 0.0, "rewards/chosen": 5.259095668792725, "rewards/margins": -0.5572719573974609, "rewards/rejected": 5.8163676261901855, "step": 5905 }, { "epoch": 1.31, "learning_rate": 2.83058130441221e-06, "logits/chosen": -2.042813539505005, "logits/rejected": -1.9981166124343872, "logps/chosen": -122.25930786132812, "logps/rejected": -108.90203094482422, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 8.298089981079102, "rewards/margins": 2.337742328643799, "rewards/rejected": 5.960347652435303, "step": 5906 }, { "epoch": 1.31, "learning_rate": 2.8289666120307773e-06, "logits/chosen": -1.6128194332122803, "logits/rejected": -1.6402862071990967, "logps/chosen": -23.683147430419922, "logps/rejected": -73.62767791748047, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 3.476597309112549, "rewards/margins": 2.3789875507354736, "rewards/rejected": 1.0976097583770752, "step": 5907 }, { "epoch": 1.31, "learning_rate": 2.827352198621918e-06, "logits/chosen": -2.1956019401550293, "logits/rejected": -1.7117351293563843, "logps/chosen": -121.84129333496094, "logps/rejected": -48.803436279296875, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 8.378748893737793, "rewards/margins": 2.954543113708496, "rewards/rejected": 5.424205780029297, "step": 5908 }, { "epoch": 1.31, "learning_rate": 2.8257380643930744e-06, "logits/chosen": -2.320150136947632, "logits/rejected": -2.3717806339263916, "logps/chosen": -159.44601440429688, "logps/rejected": -88.39186096191406, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": 9.48527717590332, "rewards/margins": 5.4935760498046875, "rewards/rejected": 3.9917008876800537, "step": 5909 }, { "epoch": 1.31, "learning_rate": 2.824124209551667e-06, "logits/chosen": -2.103344440460205, "logits/rejected": -2.0734987258911133, "logps/chosen": -108.49547576904297, "logps/rejected": -120.23111724853516, "loss": 0.4675, "rewards/accuracies": 0.0, "rewards/chosen": 9.89469051361084, "rewards/margins": -0.2548847198486328, "rewards/rejected": 10.149575233459473, "step": 5910 }, { "epoch": 1.31, "learning_rate": 2.8225106343050635e-06, "logits/chosen": -1.736642599105835, "logits/rejected": -1.7340489625930786, "logps/chosen": -36.21495056152344, "logps/rejected": -61.11064147949219, "loss": 1.9819, "rewards/accuracies": 0.0, "rewards/chosen": 3.9165306091308594, "rewards/margins": -1.7856636047363281, "rewards/rejected": 5.7021942138671875, "step": 5911 }, { "epoch": 1.31, "learning_rate": 2.8208973388606133e-06, "logits/chosen": -1.8458521366119385, "logits/rejected": -1.7009474039077759, "logps/chosen": -52.260746002197266, "logps/rejected": -46.63175964355469, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 3.1215596199035645, "rewards/margins": 3.456676483154297, "rewards/rejected": -0.3351169526576996, "step": 5912 }, { "epoch": 1.31, "learning_rate": 2.819284323425614e-06, "logits/chosen": -1.8787882328033447, "logits/rejected": -1.899559497833252, "logps/chosen": -43.94861602783203, "logps/rejected": -148.6048583984375, "loss": 1.0663, "rewards/accuracies": 0.0, "rewards/chosen": 4.681996822357178, "rewards/margins": -2.006164073944092, "rewards/rejected": 6.6881608963012695, "step": 5913 }, { "epoch": 1.31, "learning_rate": 2.817671588207338e-06, "logits/chosen": -2.2200965881347656, "logits/rejected": -2.2558765411376953, "logps/chosen": -54.93654251098633, "logps/rejected": -121.99475860595703, "loss": 0.3084, "rewards/accuracies": 1.0, "rewards/chosen": 7.708824157714844, "rewards/margins": 0.24872207641601562, "rewards/rejected": 7.460102081298828, "step": 5914 }, { "epoch": 1.31, "learning_rate": 2.816059133413016e-06, "logits/chosen": -1.8911628723144531, "logits/rejected": -1.8833483457565308, "logps/chosen": -46.730247497558594, "logps/rejected": -46.61885452270508, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 5.495255470275879, "rewards/margins": 2.27659273147583, "rewards/rejected": 3.218662738800049, "step": 5915 }, { "epoch": 1.31, "learning_rate": 2.8144469592498445e-06, "logits/chosen": -2.1256637573242188, "logits/rejected": -2.028712272644043, "logps/chosen": -59.64397048950195, "logps/rejected": -82.19815063476562, "loss": 0.1274, "rewards/accuracies": 1.0, "rewards/chosen": 6.055472373962402, "rewards/margins": 3.168807029724121, "rewards/rejected": 2.8866653442382812, "step": 5916 }, { "epoch": 1.31, "learning_rate": 2.812835065924986e-06, "logits/chosen": -1.918629765510559, "logits/rejected": -1.8793845176696777, "logps/chosen": -56.889198303222656, "logps/rejected": -43.665855407714844, "loss": 0.2759, "rewards/accuracies": 1.0, "rewards/chosen": 4.616947174072266, "rewards/margins": 0.34584712982177734, "rewards/rejected": 4.271100044250488, "step": 5917 }, { "epoch": 1.31, "learning_rate": 2.811223453645564e-06, "logits/chosen": -1.8518449068069458, "logits/rejected": -1.8467696905136108, "logps/chosen": -47.9925537109375, "logps/rejected": -62.115352630615234, "loss": 0.419, "rewards/accuracies": 0.0, "rewards/chosen": 3.003248691558838, "rewards/margins": -0.2399752140045166, "rewards/rejected": 3.2432239055633545, "step": 5918 }, { "epoch": 1.31, "learning_rate": 2.809612122618668e-06, "logits/chosen": -2.110318899154663, "logits/rejected": -2.116525411605835, "logps/chosen": -67.81590270996094, "logps/rejected": -84.05789184570312, "loss": 1.2356, "rewards/accuracies": 0.0, "rewards/chosen": 7.264506816864014, "rewards/margins": -2.3429551124572754, "rewards/rejected": 9.607461929321289, "step": 5919 }, { "epoch": 1.31, "learning_rate": 2.808001073051344e-06, "logits/chosen": -1.6804304122924805, "logits/rejected": -1.5634663105010986, "logps/chosen": -115.57012939453125, "logps/rejected": -54.35343551635742, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 5.8033127784729, "rewards/margins": 3.3042476177215576, "rewards/rejected": 2.4990651607513428, "step": 5920 }, { "epoch": 1.31, "learning_rate": 2.8063903051506175e-06, "logits/chosen": -2.281585454940796, "logits/rejected": -2.2025890350341797, "logps/chosen": -33.107025146484375, "logps/rejected": -6.054140090942383, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 3.2496566772460938, "rewards/margins": 2.4328317642211914, "rewards/rejected": 0.8168248534202576, "step": 5921 }, { "epoch": 1.31, "learning_rate": 2.8047798191234614e-06, "logits/chosen": -1.9594498872756958, "logits/rejected": -1.8976006507873535, "logps/chosen": -121.66064453125, "logps/rejected": -51.69361114501953, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 6.1351165771484375, "rewards/margins": 3.2371084690093994, "rewards/rejected": 2.898008108139038, "step": 5922 }, { "epoch": 1.31, "learning_rate": 2.8031696151768216e-06, "logits/chosen": -1.9477849006652832, "logits/rejected": -1.918889045715332, "logps/chosen": -35.05399703979492, "logps/rejected": -38.23230743408203, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 3.1056339740753174, "rewards/margins": 0.05611085891723633, "rewards/rejected": 3.049523115158081, "step": 5923 }, { "epoch": 1.31, "learning_rate": 2.8015596935176047e-06, "logits/chosen": -2.274996280670166, "logits/rejected": -2.2363314628601074, "logps/chosen": -82.00343322753906, "logps/rejected": -40.640621185302734, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 7.395674228668213, "rewards/margins": 3.193516254425049, "rewards/rejected": 4.202157974243164, "step": 5924 }, { "epoch": 1.31, "learning_rate": 2.7999500543526827e-06, "logits/chosen": -1.7619701623916626, "logits/rejected": -1.7696516513824463, "logps/chosen": -3.148820638656616, "logps/rejected": -18.047182083129883, "loss": 0.6077, "rewards/accuracies": 0.0, "rewards/chosen": 0.7971031069755554, "rewards/margins": -0.712602436542511, "rewards/rejected": 1.5097055435180664, "step": 5925 }, { "epoch": 1.31, "learning_rate": 2.7983406978888896e-06, "logits/chosen": -1.8240814208984375, "logits/rejected": -1.8195123672485352, "logps/chosen": -25.388046264648438, "logps/rejected": -38.33662414550781, "loss": 0.3079, "rewards/accuracies": 1.0, "rewards/chosen": 3.5383756160736084, "rewards/margins": 0.16141176223754883, "rewards/rejected": 3.3769638538360596, "step": 5926 }, { "epoch": 1.31, "learning_rate": 2.7967316243330243e-06, "logits/chosen": -2.053239583969116, "logits/rejected": -2.0437304973602295, "logps/chosen": -42.40525436401367, "logps/rejected": -68.40887451171875, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 4.7254414558410645, "rewards/margins": 3.200079917907715, "rewards/rejected": 1.5253616571426392, "step": 5927 }, { "epoch": 1.31, "learning_rate": 2.7951228338918506e-06, "logits/chosen": -1.9091918468475342, "logits/rejected": -1.8757281303405762, "logps/chosen": -82.20964050292969, "logps/rejected": -89.07823944091797, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 5.861756801605225, "rewards/margins": 1.8356022834777832, "rewards/rejected": 4.026154518127441, "step": 5928 }, { "epoch": 1.31, "learning_rate": 2.793514326772089e-06, "logits/chosen": -1.7798118591308594, "logits/rejected": -1.6682746410369873, "logps/chosen": -93.01405334472656, "logps/rejected": -83.48634338378906, "loss": 0.0964, "rewards/accuracies": 1.0, "rewards/chosen": 6.212243556976318, "rewards/margins": 1.7219467163085938, "rewards/rejected": 4.490296840667725, "step": 5929 }, { "epoch": 1.31, "learning_rate": 2.791906103180435e-06, "logits/chosen": -1.9844597578048706, "logits/rejected": -2.001469135284424, "logps/chosen": -25.555570602416992, "logps/rejected": -45.224266052246094, "loss": 0.2679, "rewards/accuracies": 1.0, "rewards/chosen": 3.632283926010132, "rewards/margins": 1.0071091651916504, "rewards/rejected": 2.6251747608184814, "step": 5930 }, { "epoch": 1.31, "learning_rate": 2.790298163323538e-06, "logits/chosen": -2.153012752532959, "logits/rejected": -2.130051851272583, "logps/chosen": -34.884944915771484, "logps/rejected": -37.52241134643555, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 2.527496814727783, "rewards/margins": 1.3049263954162598, "rewards/rejected": 1.2225704193115234, "step": 5931 }, { "epoch": 1.31, "learning_rate": 2.788690507408014e-06, "logits/chosen": -1.9654041528701782, "logits/rejected": -1.9432518482208252, "logps/chosen": -71.65802001953125, "logps/rejected": -67.85995483398438, "loss": 0.1974, "rewards/accuracies": 1.0, "rewards/chosen": 2.6673552989959717, "rewards/margins": 0.778427004814148, "rewards/rejected": 1.8889282941818237, "step": 5932 }, { "epoch": 1.31, "learning_rate": 2.787083135640445e-06, "logits/chosen": -2.2646002769470215, "logits/rejected": -2.2556276321411133, "logps/chosen": -26.751386642456055, "logps/rejected": -52.89766311645508, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 2.8068273067474365, "rewards/margins": 1.518844723701477, "rewards/rejected": 1.2879825830459595, "step": 5933 }, { "epoch": 1.31, "learning_rate": 2.7854760482273722e-06, "logits/chosen": -1.7623229026794434, "logits/rejected": -1.7452948093414307, "logps/chosen": -169.09564208984375, "logps/rejected": -92.43492126464844, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 9.713464736938477, "rewards/margins": 6.950290203094482, "rewards/rejected": 2.763174533843994, "step": 5934 }, { "epoch": 1.31, "learning_rate": 2.783869245375304e-06, "logits/chosen": -1.7827317714691162, "logits/rejected": -1.7261847257614136, "logps/chosen": -43.235225677490234, "logps/rejected": -50.425392150878906, "loss": 0.2529, "rewards/accuracies": 1.0, "rewards/chosen": 2.061548948287964, "rewards/margins": 1.6632167100906372, "rewards/rejected": 0.3983322083950043, "step": 5935 }, { "epoch": 1.31, "learning_rate": 2.782262727290711e-06, "logits/chosen": -2.086268901824951, "logits/rejected": -2.074704170227051, "logps/chosen": -74.16897583007812, "logps/rejected": -69.8077621459961, "loss": 0.3514, "rewards/accuracies": 1.0, "rewards/chosen": 4.589763164520264, "rewards/margins": 0.25017547607421875, "rewards/rejected": 4.339587688446045, "step": 5936 }, { "epoch": 1.31, "learning_rate": 2.7806564941800253e-06, "logits/chosen": -2.027933120727539, "logits/rejected": -1.6712521314620972, "logps/chosen": -42.103485107421875, "logps/rejected": -206.929931640625, "loss": 0.433, "rewards/accuracies": 0.0, "rewards/chosen": 7.439435005187988, "rewards/margins": -0.28521728515625, "rewards/rejected": 7.724652290344238, "step": 5937 }, { "epoch": 1.31, "learning_rate": 2.7790505462496453e-06, "logits/chosen": -1.7818583250045776, "logits/rejected": -1.836305856704712, "logps/chosen": -69.70179748535156, "logps/rejected": -75.29212951660156, "loss": 0.1564, "rewards/accuracies": 1.0, "rewards/chosen": 6.993536472320557, "rewards/margins": 2.4494924545288086, "rewards/rejected": 4.544044017791748, "step": 5938 }, { "epoch": 1.31, "learning_rate": 2.777444883705933e-06, "logits/chosen": -1.7125604152679443, "logits/rejected": -1.5080616474151611, "logps/chosen": -131.72592163085938, "logps/rejected": -75.99945068359375, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 6.319958686828613, "rewards/margins": 3.19877552986145, "rewards/rejected": 3.121183156967163, "step": 5939 }, { "epoch": 1.31, "learning_rate": 2.775839506755209e-06, "logits/chosen": -1.7973482608795166, "logits/rejected": -1.7803566455841064, "logps/chosen": -92.32815551757812, "logps/rejected": -106.05137634277344, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 10.865180969238281, "rewards/margins": 2.519200325012207, "rewards/rejected": 8.345980644226074, "step": 5940 }, { "epoch": 1.31, "learning_rate": 2.774234415603761e-06, "logits/chosen": -1.5948009490966797, "logits/rejected": -1.574149489402771, "logps/chosen": -52.775421142578125, "logps/rejected": -47.19151306152344, "loss": 0.2395, "rewards/accuracies": 1.0, "rewards/chosen": 5.361331939697266, "rewards/margins": 0.5822563171386719, "rewards/rejected": 4.779075622558594, "step": 5941 }, { "epoch": 1.32, "learning_rate": 2.77262961045784e-06, "logits/chosen": -1.7778033018112183, "logits/rejected": -1.8285764455795288, "logps/chosen": -47.33902359008789, "logps/rejected": -134.89273071289062, "loss": 0.2101, "rewards/accuracies": 1.0, "rewards/chosen": 8.48754596710205, "rewards/margins": 1.0574097633361816, "rewards/rejected": 7.430136203765869, "step": 5942 }, { "epoch": 1.32, "learning_rate": 2.7710250915236598e-06, "logits/chosen": -1.9360487461090088, "logits/rejected": -1.8373242616653442, "logps/chosen": -73.60139465332031, "logps/rejected": -43.151573181152344, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 8.4293851852417, "rewards/margins": 5.73100471496582, "rewards/rejected": 2.6983802318573, "step": 5943 }, { "epoch": 1.32, "learning_rate": 2.769420859007397e-06, "logits/chosen": -1.7560323476791382, "logits/rejected": -1.7398134469985962, "logps/chosen": -32.49147033691406, "logps/rejected": -38.93259048461914, "loss": 0.3801, "rewards/accuracies": 1.0, "rewards/chosen": 3.2596306800842285, "rewards/margins": 0.27920031547546387, "rewards/rejected": 2.9804303646087646, "step": 5944 }, { "epoch": 1.32, "learning_rate": 2.7678169131151912e-06, "logits/chosen": -2.4983410835266113, "logits/rejected": -1.8813843727111816, "logps/chosen": -72.36125183105469, "logps/rejected": -59.913330078125, "loss": 0.1728, "rewards/accuracies": 1.0, "rewards/chosen": 4.621299743652344, "rewards/margins": 2.3465025424957275, "rewards/rejected": 2.274797201156616, "step": 5945 }, { "epoch": 1.32, "learning_rate": 2.7662132540531465e-06, "logits/chosen": -1.799181342124939, "logits/rejected": -1.784466028213501, "logps/chosen": -95.84208679199219, "logps/rejected": -60.495758056640625, "loss": 0.1771, "rewards/accuracies": 1.0, "rewards/chosen": 6.798535346984863, "rewards/margins": 1.043304443359375, "rewards/rejected": 5.755230903625488, "step": 5946 }, { "epoch": 1.32, "learning_rate": 2.764609882027331e-06, "logits/chosen": -1.6742700338363647, "logits/rejected": -1.6719251871109009, "logps/chosen": -44.46253204345703, "logps/rejected": -68.28688049316406, "loss": 1.289, "rewards/accuracies": 0.0, "rewards/chosen": 2.992440938949585, "rewards/margins": -2.442798376083374, "rewards/rejected": 5.435239315032959, "step": 5947 }, { "epoch": 1.32, "learning_rate": 2.7630067972437672e-06, "logits/chosen": -1.9946333169937134, "logits/rejected": -1.6442277431488037, "logps/chosen": -186.13217163085938, "logps/rejected": -87.4278564453125, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 7.462436199188232, "rewards/margins": 3.5815417766571045, "rewards/rejected": 3.880894422531128, "step": 5948 }, { "epoch": 1.32, "learning_rate": 2.7614039999084574e-06, "logits/chosen": -2.0170257091522217, "logits/rejected": -2.0278308391571045, "logps/chosen": -98.02745056152344, "logps/rejected": -149.62936401367188, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 9.097712516784668, "rewards/margins": 2.9350032806396484, "rewards/rejected": 6.1627092361450195, "step": 5949 }, { "epoch": 1.32, "learning_rate": 2.7598014902273495e-06, "logits/chosen": -1.7210451364517212, "logits/rejected": -1.7264314889907837, "logps/chosen": -40.80585479736328, "logps/rejected": -53.0650520324707, "loss": 0.6215, "rewards/accuracies": 1.0, "rewards/chosen": 2.887535810470581, "rewards/margins": 0.3104517459869385, "rewards/rejected": 2.5770840644836426, "step": 5950 }, { "epoch": 1.32, "learning_rate": 2.7581992684063645e-06, "logits/chosen": -1.9162604808807373, "logits/rejected": -1.919546365737915, "logps/chosen": -53.70714569091797, "logps/rejected": -41.708099365234375, "loss": 0.1379, "rewards/accuracies": 1.0, "rewards/chosen": 4.211124420166016, "rewards/margins": 1.1654052734375, "rewards/rejected": 3.0457191467285156, "step": 5951 }, { "epoch": 1.32, "learning_rate": 2.756597334651385e-06, "logits/chosen": -1.9664452075958252, "logits/rejected": -1.9062671661376953, "logps/chosen": -30.417831420898438, "logps/rejected": -14.182766914367676, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 3.3744752407073975, "rewards/margins": 2.819783926010132, "rewards/rejected": 0.5546912550926208, "step": 5952 }, { "epoch": 1.32, "learning_rate": 2.754995689168255e-06, "logits/chosen": -2.116929054260254, "logits/rejected": -2.0873711109161377, "logps/chosen": -78.32955932617188, "logps/rejected": -72.59517669677734, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 7.195569038391113, "rewards/margins": 2.4237542152404785, "rewards/rejected": 4.771814823150635, "step": 5953 }, { "epoch": 1.32, "learning_rate": 2.7533943321627815e-06, "logits/chosen": -1.799573302268982, "logits/rejected": -1.7031139135360718, "logps/chosen": -154.58984375, "logps/rejected": -66.5566177368164, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 8.7639741897583, "rewards/margins": 5.080109596252441, "rewards/rejected": 3.6838645935058594, "step": 5954 }, { "epoch": 1.32, "learning_rate": 2.7517932638407364e-06, "logits/chosen": -1.8903188705444336, "logits/rejected": -1.8862234354019165, "logps/chosen": -62.246551513671875, "logps/rejected": -62.33962631225586, "loss": 0.1982, "rewards/accuracies": 1.0, "rewards/chosen": 5.065097332000732, "rewards/margins": 1.3065755367279053, "rewards/rejected": 3.758521795272827, "step": 5955 }, { "epoch": 1.32, "learning_rate": 2.7501924844078538e-06, "logits/chosen": -1.8987795114517212, "logits/rejected": -1.8987795114517212, "logps/chosen": -14.07273006439209, "logps/rejected": -14.07273006439209, "loss": 0.516, "rewards/accuracies": 0.0, "rewards/chosen": 1.9845551252365112, "rewards/margins": 0.0, "rewards/rejected": 1.9845551252365112, "step": 5956 }, { "epoch": 1.32, "learning_rate": 2.7485919940698245e-06, "logits/chosen": -1.7347934246063232, "logits/rejected": -1.5562928915023804, "logps/chosen": -122.3707504272461, "logps/rejected": -63.364166259765625, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 12.30447006225586, "rewards/margins": 7.39837646484375, "rewards/rejected": 4.906093597412109, "step": 5957 }, { "epoch": 1.32, "learning_rate": 2.7469917930323163e-06, "logits/chosen": -1.7948088645935059, "logits/rejected": -1.7605109214782715, "logps/chosen": -36.72913360595703, "logps/rejected": -107.43572998046875, "loss": 0.349, "rewards/accuracies": 1.0, "rewards/chosen": 2.449084520339966, "rewards/margins": 0.11757206916809082, "rewards/rejected": 2.331512451171875, "step": 5958 }, { "epoch": 1.32, "learning_rate": 2.745391881500944e-06, "logits/chosen": -2.0013504028320312, "logits/rejected": -1.9241176843643188, "logps/chosen": -42.301475524902344, "logps/rejected": -18.09201431274414, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 2.7005226612091064, "rewards/margins": 2.462116003036499, "rewards/rejected": 0.23840676248073578, "step": 5959 }, { "epoch": 1.32, "learning_rate": 2.7437922596812947e-06, "logits/chosen": -1.9298096895217896, "logits/rejected": -1.9509830474853516, "logps/chosen": -118.07897186279297, "logps/rejected": -110.93539428710938, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 6.922606945037842, "rewards/margins": 1.530385971069336, "rewards/rejected": 5.392220973968506, "step": 5960 }, { "epoch": 1.32, "learning_rate": 2.742192927778917e-06, "logits/chosen": -2.067811965942383, "logits/rejected": -2.0165927410125732, "logps/chosen": -47.09397506713867, "logps/rejected": -62.728878021240234, "loss": 0.2909, "rewards/accuracies": 1.0, "rewards/chosen": 3.4472568035125732, "rewards/margins": 0.347137451171875, "rewards/rejected": 3.1001193523406982, "step": 5961 }, { "epoch": 1.32, "learning_rate": 2.740593885999321e-06, "logits/chosen": -1.9567173719406128, "logits/rejected": -1.915635585784912, "logps/chosen": -95.47102355957031, "logps/rejected": -88.16484069824219, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 8.235228538513184, "rewards/margins": 2.644257068634033, "rewards/rejected": 5.59097146987915, "step": 5962 }, { "epoch": 1.32, "learning_rate": 2.738995134547978e-06, "logits/chosen": -1.8901176452636719, "logits/rejected": -1.849516749382019, "logps/chosen": -42.40156555175781, "logps/rejected": -55.43024444580078, "loss": 0.1703, "rewards/accuracies": 1.0, "rewards/chosen": 4.628057956695557, "rewards/margins": 1.8789360523223877, "rewards/rejected": 2.749121904373169, "step": 5963 }, { "epoch": 1.32, "learning_rate": 2.737396673630326e-06, "logits/chosen": -1.818488597869873, "logits/rejected": -1.7310594320297241, "logps/chosen": -26.97629165649414, "logps/rejected": -5.416937351226807, "loss": 0.3652, "rewards/accuracies": 1.0, "rewards/chosen": 4.037748336791992, "rewards/margins": 3.271912097930908, "rewards/rejected": 0.7658362984657288, "step": 5964 }, { "epoch": 1.32, "learning_rate": 2.7357985034517632e-06, "logits/chosen": -1.6413791179656982, "logits/rejected": -1.6737006902694702, "logps/chosen": -18.993492126464844, "logps/rejected": -95.25718688964844, "loss": 1.0362, "rewards/accuracies": 0.0, "rewards/chosen": 4.362299919128418, "rewards/margins": -1.8624787330627441, "rewards/rejected": 6.224778652191162, "step": 5965 }, { "epoch": 1.32, "learning_rate": 2.734200624217646e-06, "logits/chosen": -1.918745517730713, "logits/rejected": -1.8734618425369263, "logps/chosen": -54.213565826416016, "logps/rejected": -54.08380889892578, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": 3.8136165142059326, "rewards/margins": 2.0326433181762695, "rewards/rejected": 1.7809730768203735, "step": 5966 }, { "epoch": 1.32, "learning_rate": 2.732603036133306e-06, "logits/chosen": -1.8238455057144165, "logits/rejected": -1.7450132369995117, "logps/chosen": -72.02525329589844, "logps/rejected": -64.16075897216797, "loss": 0.0737, "rewards/accuracies": 1.0, "rewards/chosen": 4.8580474853515625, "rewards/margins": 2.0803465843200684, "rewards/rejected": 2.777700901031494, "step": 5967 }, { "epoch": 1.32, "learning_rate": 2.731005739404021e-06, "logits/chosen": -1.5413726568222046, "logits/rejected": -1.4627233743667603, "logps/chosen": -17.553552627563477, "logps/rejected": -11.335684776306152, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 1.8330286741256714, "rewards/margins": 0.8775188326835632, "rewards/rejected": 0.9555098414421082, "step": 5968 }, { "epoch": 1.32, "learning_rate": 2.7294087342350485e-06, "logits/chosen": -1.988096833229065, "logits/rejected": -1.8277250528335571, "logps/chosen": -105.12064361572266, "logps/rejected": -206.2529754638672, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 8.810206413269043, "rewards/margins": 3.912116527557373, "rewards/rejected": 4.89808988571167, "step": 5969 }, { "epoch": 1.32, "learning_rate": 2.7278120208315927e-06, "logits/chosen": -1.9498026371002197, "logits/rejected": -1.8979613780975342, "logps/chosen": -79.67692565917969, "logps/rejected": -86.28849792480469, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 9.610879898071289, "rewards/margins": 4.901960849761963, "rewards/rejected": 4.708919048309326, "step": 5970 }, { "epoch": 1.32, "learning_rate": 2.7262155993988303e-06, "logits/chosen": -1.8866174221038818, "logits/rejected": -1.8781291246414185, "logps/chosen": -16.267169952392578, "logps/rejected": -76.96796417236328, "loss": 2.9373, "rewards/accuracies": 0.0, "rewards/chosen": 1.150808334350586, "rewards/margins": -5.795795917510986, "rewards/rejected": 6.946604251861572, "step": 5971 }, { "epoch": 1.32, "learning_rate": 2.724619470141897e-06, "logits/chosen": -2.1398987770080566, "logits/rejected": -2.1474618911743164, "logps/chosen": -79.13368225097656, "logps/rejected": -115.58038330078125, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 11.109410285949707, "rewards/margins": 1.5870304107666016, "rewards/rejected": 9.522379875183105, "step": 5972 }, { "epoch": 1.32, "learning_rate": 2.723023633265892e-06, "logits/chosen": -1.847934365272522, "logits/rejected": -1.897371768951416, "logps/chosen": -31.987287521362305, "logps/rejected": -102.24612426757812, "loss": 0.8434, "rewards/accuracies": 0.0, "rewards/chosen": 4.396331787109375, "rewards/margins": -1.4815917015075684, "rewards/rejected": 5.877923488616943, "step": 5973 }, { "epoch": 1.32, "learning_rate": 2.721428088975879e-06, "logits/chosen": -1.5387943983078003, "logits/rejected": -1.4588253498077393, "logps/chosen": -32.770912170410156, "logps/rejected": -13.802376747131348, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": 2.395854949951172, "rewards/margins": 1.8660526275634766, "rewards/rejected": 0.5298022627830505, "step": 5974 }, { "epoch": 1.32, "learning_rate": 2.7198328374768747e-06, "logits/chosen": -2.069087505340576, "logits/rejected": -2.1081743240356445, "logps/chosen": -22.81734275817871, "logps/rejected": -128.2224578857422, "loss": 1.8547, "rewards/accuracies": 0.0, "rewards/chosen": 3.347088575363159, "rewards/margins": -2.9077484607696533, "rewards/rejected": 6.2548370361328125, "step": 5975 }, { "epoch": 1.32, "learning_rate": 2.7182378789738732e-06, "logits/chosen": -1.6030482053756714, "logits/rejected": -1.5490822792053223, "logps/chosen": -6.131948471069336, "logps/rejected": -16.608095169067383, "loss": 0.6101, "rewards/accuracies": 1.0, "rewards/chosen": 0.8723034858703613, "rewards/margins": 0.36523371934890747, "rewards/rejected": 0.5070697665214539, "step": 5976 }, { "epoch": 1.32, "learning_rate": 2.7166432136718156e-06, "logits/chosen": -2.102327346801758, "logits/rejected": -2.0514180660247803, "logps/chosen": -65.16802978515625, "logps/rejected": -61.43372344970703, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": 5.609832286834717, "rewards/margins": 1.0634355545043945, "rewards/rejected": 4.546396732330322, "step": 5977 }, { "epoch": 1.32, "learning_rate": 2.7150488417756195e-06, "logits/chosen": -1.9437538385391235, "logits/rejected": -1.907476782798767, "logps/chosen": -43.829036712646484, "logps/rejected": -43.34031677246094, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 3.578756332397461, "rewards/margins": 1.586520791053772, "rewards/rejected": 1.992235541343689, "step": 5978 }, { "epoch": 1.32, "learning_rate": 2.7134547634901524e-06, "logits/chosen": -2.1371102333068848, "logits/rejected": -2.19034481048584, "logps/chosen": -226.62649536132812, "logps/rejected": -52.95158386230469, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 5.531851291656494, "rewards/margins": 2.678370714187622, "rewards/rejected": 2.853480577468872, "step": 5979 }, { "epoch": 1.32, "learning_rate": 2.711860979020252e-06, "logits/chosen": -1.9284110069274902, "logits/rejected": -1.894513487815857, "logps/chosen": -72.64383697509766, "logps/rejected": -67.69917297363281, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": 6.357637882232666, "rewards/margins": 1.7314810752868652, "rewards/rejected": 4.626156806945801, "step": 5980 }, { "epoch": 1.32, "learning_rate": 2.7102674885707146e-06, "logits/chosen": -2.0540761947631836, "logits/rejected": -2.0607247352600098, "logps/chosen": -28.972803115844727, "logps/rejected": -20.717761993408203, "loss": 0.1841, "rewards/accuracies": 1.0, "rewards/chosen": 3.718634843826294, "rewards/margins": 1.266695499420166, "rewards/rejected": 2.451939344406128, "step": 5981 }, { "epoch": 1.32, "learning_rate": 2.7086742923463004e-06, "logits/chosen": -2.092827796936035, "logits/rejected": -2.021864414215088, "logps/chosen": -84.97612762451172, "logps/rejected": -80.53842163085938, "loss": 0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 6.322757720947266, "rewards/margins": 2.3729164600372314, "rewards/rejected": 3.949841260910034, "step": 5982 }, { "epoch": 1.32, "learning_rate": 2.7070813905517347e-06, "logits/chosen": -2.0580990314483643, "logits/rejected": -1.5341907739639282, "logps/chosen": -87.24642944335938, "logps/rejected": -63.14913558959961, "loss": 0.2732, "rewards/accuracies": 1.0, "rewards/chosen": 6.177618503570557, "rewards/margins": 1.6642637252807617, "rewards/rejected": 4.513354778289795, "step": 5983 }, { "epoch": 1.32, "learning_rate": 2.7054887833916933e-06, "logits/chosen": -1.823319673538208, "logits/rejected": -1.9103049039840698, "logps/chosen": -60.30292510986328, "logps/rejected": -119.41615295410156, "loss": 0.42, "rewards/accuracies": 0.0, "rewards/chosen": 6.326737403869629, "rewards/margins": -0.19545984268188477, "rewards/rejected": 6.522197246551514, "step": 5984 }, { "epoch": 1.32, "learning_rate": 2.7038964710708315e-06, "logits/chosen": -1.9203531742095947, "logits/rejected": -1.859345555305481, "logps/chosen": -93.10302734375, "logps/rejected": -171.04689025878906, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 11.425528526306152, "rewards/margins": 4.511977195739746, "rewards/rejected": 6.913551330566406, "step": 5985 }, { "epoch": 1.32, "learning_rate": 2.7023044537937494e-06, "logits/chosen": -1.8953558206558228, "logits/rejected": -1.9312385320663452, "logps/chosen": -59.384422302246094, "logps/rejected": -112.32937622070312, "loss": 0.147, "rewards/accuracies": 1.0, "rewards/chosen": 9.377741813659668, "rewards/margins": 1.820664882659912, "rewards/rejected": 7.557076930999756, "step": 5986 }, { "epoch": 1.33, "learning_rate": 2.700712731765025e-06, "logits/chosen": -1.9361350536346436, "logits/rejected": -1.9361350536346436, "logps/chosen": -44.64537811279297, "logps/rejected": -44.64537811279297, "loss": 0.4552, "rewards/accuracies": 0.0, "rewards/chosen": 4.460803985595703, "rewards/margins": 0.0, "rewards/rejected": 4.460803985595703, "step": 5987 }, { "epoch": 1.33, "learning_rate": 2.6991213051891863e-06, "logits/chosen": -1.9853107929229736, "logits/rejected": -1.8621482849121094, "logps/chosen": -123.32049560546875, "logps/rejected": -82.2205810546875, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": 9.3052396774292, "rewards/margins": 3.973928451538086, "rewards/rejected": 5.331311225891113, "step": 5988 }, { "epoch": 1.33, "learning_rate": 2.6975301742707282e-06, "logits/chosen": -2.0404722690582275, "logits/rejected": -2.0619349479675293, "logps/chosen": -174.69680786132812, "logps/rejected": -62.827789306640625, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 7.477357387542725, "rewards/margins": 2.9208173751831055, "rewards/rejected": 4.556540012359619, "step": 5989 }, { "epoch": 1.33, "learning_rate": 2.6959393392141075e-06, "logits/chosen": -2.181823968887329, "logits/rejected": -2.2240660190582275, "logps/chosen": -52.83317565917969, "logps/rejected": -156.23300170898438, "loss": 2.284, "rewards/accuracies": 0.0, "rewards/chosen": 4.208329677581787, "rewards/margins": -4.354673862457275, "rewards/rejected": 8.563003540039062, "step": 5990 }, { "epoch": 1.33, "learning_rate": 2.6943488002237435e-06, "logits/chosen": -1.621049404144287, "logits/rejected": -1.6348379850387573, "logps/chosen": -26.103351593017578, "logps/rejected": -39.060489654541016, "loss": 0.4388, "rewards/accuracies": 1.0, "rewards/chosen": 2.454002857208252, "rewards/margins": 0.16448521614074707, "rewards/rejected": 2.289517641067505, "step": 5991 }, { "epoch": 1.33, "learning_rate": 2.692758557504017e-06, "logits/chosen": -1.9313035011291504, "logits/rejected": -1.9262125492095947, "logps/chosen": -82.09968566894531, "logps/rejected": -154.03904724121094, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 8.332859992980957, "rewards/margins": 2.471524238586426, "rewards/rejected": 5.861335754394531, "step": 5992 }, { "epoch": 1.33, "learning_rate": 2.6911686112592665e-06, "logits/chosen": -2.059124231338501, "logits/rejected": -2.093111276626587, "logps/chosen": -40.33714294433594, "logps/rejected": -157.55734252929688, "loss": 0.1703, "rewards/accuracies": 1.0, "rewards/chosen": 9.13459300994873, "rewards/margins": 1.012415885925293, "rewards/rejected": 8.122177124023438, "step": 5993 }, { "epoch": 1.33, "learning_rate": 2.689578961693803e-06, "logits/chosen": -2.178685188293457, "logits/rejected": -2.1559314727783203, "logps/chosen": -89.53124237060547, "logps/rejected": -42.469417572021484, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": 4.977121829986572, "rewards/margins": 1.8449978828430176, "rewards/rejected": 3.1321239471435547, "step": 5994 }, { "epoch": 1.33, "learning_rate": 2.6879896090118845e-06, "logits/chosen": -1.8692086935043335, "logits/rejected": -2.026517391204834, "logps/chosen": -98.87623596191406, "logps/rejected": -62.12318801879883, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 7.797463893890381, "rewards/margins": 3.1897096633911133, "rewards/rejected": 4.607754230499268, "step": 5995 }, { "epoch": 1.33, "learning_rate": 2.6864005534177473e-06, "logits/chosen": -2.1443557739257812, "logits/rejected": -1.9780274629592896, "logps/chosen": -23.577571868896484, "logps/rejected": -126.73673248291016, "loss": 1.6363, "rewards/accuracies": 0.0, "rewards/chosen": 2.8492190837860107, "rewards/margins": -3.230287790298462, "rewards/rejected": 6.079506874084473, "step": 5996 }, { "epoch": 1.33, "learning_rate": 2.6848117951155758e-06, "logits/chosen": -1.6467314958572388, "logits/rejected": -1.599949836730957, "logps/chosen": -33.46588897705078, "logps/rejected": -23.041240692138672, "loss": 0.6173, "rewards/accuracies": 0.0, "rewards/chosen": 1.7621582746505737, "rewards/margins": -0.7417258024215698, "rewards/rejected": 2.5038840770721436, "step": 5997 }, { "epoch": 1.33, "learning_rate": 2.6832233343095225e-06, "logits/chosen": -2.2049896717071533, "logits/rejected": -2.1461117267608643, "logps/chosen": -51.229488372802734, "logps/rejected": -19.34178352355957, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": 3.4060306549072266, "rewards/margins": 1.4768266677856445, "rewards/rejected": 1.929203987121582, "step": 5998 }, { "epoch": 1.33, "learning_rate": 2.6816351712037013e-06, "logits/chosen": -2.0371944904327393, "logits/rejected": -1.9745434522628784, "logps/chosen": -118.8365478515625, "logps/rejected": -84.87052154541016, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 5.986151218414307, "rewards/margins": 3.821431875228882, "rewards/rejected": 2.164719343185425, "step": 5999 }, { "epoch": 1.33, "learning_rate": 2.680047306002188e-06, "logits/chosen": -2.0333478450775146, "logits/rejected": -2.0858893394470215, "logps/chosen": -55.30567169189453, "logps/rejected": -60.996498107910156, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 5.641455173492432, "rewards/margins": 0.22323894500732422, "rewards/rejected": 5.418216228485107, "step": 6000 }, { "epoch": 1.33, "learning_rate": 2.6784597389090206e-06, "logits/chosen": -1.9715701341629028, "logits/rejected": -1.9715701341629028, "logps/chosen": -26.782230377197266, "logps/rejected": -26.782230377197266, "loss": 0.4361, "rewards/accuracies": 0.0, "rewards/chosen": 3.602773427963257, "rewards/margins": 0.0, "rewards/rejected": 3.602773427963257, "step": 6001 }, { "epoch": 1.33, "learning_rate": 2.6768724701281913e-06, "logits/chosen": -2.128533124923706, "logits/rejected": -2.1341657638549805, "logps/chosen": -43.75700378417969, "logps/rejected": -44.506561279296875, "loss": 0.6093, "rewards/accuracies": 0.0, "rewards/chosen": 3.374802350997925, "rewards/margins": -0.7560641765594482, "rewards/rejected": 4.130866527557373, "step": 6002 }, { "epoch": 1.33, "learning_rate": 2.6752854998636696e-06, "logits/chosen": -2.3421294689178467, "logits/rejected": -2.3302505016326904, "logps/chosen": -96.67411804199219, "logps/rejected": -41.26301956176758, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 8.117171287536621, "rewards/margins": 5.612094879150391, "rewards/rejected": 2.5050761699676514, "step": 6003 }, { "epoch": 1.33, "learning_rate": 2.6736988283193686e-06, "logits/chosen": -1.881618857383728, "logits/rejected": -1.8741220235824585, "logps/chosen": -153.9097137451172, "logps/rejected": -70.67463684082031, "loss": 0.0989, "rewards/accuracies": 1.0, "rewards/chosen": 8.649307250976562, "rewards/margins": 1.5280804634094238, "rewards/rejected": 7.121226787567139, "step": 6004 }, { "epoch": 1.33, "learning_rate": 2.6721124556991797e-06, "logits/chosen": -2.1870622634887695, "logits/rejected": -2.1456634998321533, "logps/chosen": -73.9292984008789, "logps/rejected": -81.79679870605469, "loss": 0.1487, "rewards/accuracies": 1.0, "rewards/chosen": 5.483959197998047, "rewards/margins": 1.4913642406463623, "rewards/rejected": 3.9925949573516846, "step": 6005 }, { "epoch": 1.33, "learning_rate": 2.6705263822069427e-06, "logits/chosen": -2.0309560298919678, "logits/rejected": -2.0430259704589844, "logps/chosen": -38.00408172607422, "logps/rejected": -50.740814208984375, "loss": 0.2632, "rewards/accuracies": 1.0, "rewards/chosen": 3.64312744140625, "rewards/margins": 0.3670065402984619, "rewards/rejected": 3.276120901107788, "step": 6006 }, { "epoch": 1.33, "learning_rate": 2.668940608046465e-06, "logits/chosen": -2.145198345184326, "logits/rejected": -2.115298271179199, "logps/chosen": -96.96527099609375, "logps/rejected": -127.75994873046875, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 9.628822326660156, "rewards/margins": 2.7157363891601562, "rewards/rejected": 6.9130859375, "step": 6007 }, { "epoch": 1.33, "learning_rate": 2.667355133421515e-06, "logits/chosen": -2.357593536376953, "logits/rejected": -2.358309268951416, "logps/chosen": -46.55543518066406, "logps/rejected": -23.103931427001953, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": 3.6764168739318848, "rewards/margins": 3.523792266845703, "rewards/rejected": 0.15262471139431, "step": 6008 }, { "epoch": 1.33, "learning_rate": 2.665769958535824e-06, "logits/chosen": -2.0324337482452393, "logits/rejected": -1.9666374921798706, "logps/chosen": -51.50065612792969, "logps/rejected": -114.5724105834961, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": 8.641838073730469, "rewards/margins": 2.1342062950134277, "rewards/rejected": 6.507631778717041, "step": 6009 }, { "epoch": 1.33, "learning_rate": 2.6641850835930836e-06, "logits/chosen": -1.893652081489563, "logits/rejected": -1.7419177293777466, "logps/chosen": -121.35980224609375, "logps/rejected": -47.75470733642578, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 7.143263339996338, "rewards/margins": 5.092442512512207, "rewards/rejected": 2.05082106590271, "step": 6010 }, { "epoch": 1.33, "learning_rate": 2.662600508796941e-06, "logits/chosen": -2.2234485149383545, "logits/rejected": -2.1649744510650635, "logps/chosen": -99.08518981933594, "logps/rejected": -41.16714859008789, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 7.950596809387207, "rewards/margins": 3.560910701751709, "rewards/rejected": 4.389686107635498, "step": 6011 }, { "epoch": 1.33, "learning_rate": 2.6610162343510183e-06, "logits/chosen": -1.9596527814865112, "logits/rejected": -1.901526927947998, "logps/chosen": -47.46257019042969, "logps/rejected": -61.551239013671875, "loss": 0.2102, "rewards/accuracies": 1.0, "rewards/chosen": 2.9429352283477783, "rewards/margins": 0.7178192138671875, "rewards/rejected": 2.225116014480591, "step": 6012 }, { "epoch": 1.33, "learning_rate": 2.659432260458882e-06, "logits/chosen": -1.986280918121338, "logits/rejected": -1.986280918121338, "logps/chosen": -19.27716827392578, "logps/rejected": -19.27716827392578, "loss": 1.1465, "rewards/accuracies": 0.0, "rewards/chosen": 3.6327264308929443, "rewards/margins": 0.0, "rewards/rejected": 3.6327264308929443, "step": 6013 }, { "epoch": 1.33, "learning_rate": 2.6578485873240777e-06, "logits/chosen": -2.00476336479187, "logits/rejected": -2.00476336479187, "logps/chosen": -69.08751678466797, "logps/rejected": -69.08751678466797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 7.924714088439941, "rewards/margins": 0.0, "rewards/rejected": 7.924714088439941, "step": 6014 }, { "epoch": 1.33, "learning_rate": 2.6562652151500976e-06, "logits/chosen": -2.353178024291992, "logits/rejected": -2.4165947437286377, "logps/chosen": -157.77528381347656, "logps/rejected": -62.60969161987305, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 7.615060329437256, "rewards/margins": 3.744910717010498, "rewards/rejected": 3.870149612426758, "step": 6015 }, { "epoch": 1.33, "learning_rate": 2.6546821441404035e-06, "logits/chosen": -1.9176963567733765, "logits/rejected": -1.9357489347457886, "logps/chosen": -37.67930603027344, "logps/rejected": -42.62158966064453, "loss": 0.2842, "rewards/accuracies": 1.0, "rewards/chosen": 2.730419158935547, "rewards/margins": 0.34513306617736816, "rewards/rejected": 2.3852860927581787, "step": 6016 }, { "epoch": 1.33, "learning_rate": 2.653099374498416e-06, "logits/chosen": -1.8952367305755615, "logits/rejected": -1.5851161479949951, "logps/chosen": -56.190635681152344, "logps/rejected": -99.46315002441406, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 5.852036476135254, "rewards/margins": 2.615529775619507, "rewards/rejected": 3.236506700515747, "step": 6017 }, { "epoch": 1.33, "learning_rate": 2.651516906427517e-06, "logits/chosen": -2.043264389038086, "logits/rejected": -2.0479531288146973, "logps/chosen": -41.692169189453125, "logps/rejected": -87.54691314697266, "loss": 0.7764, "rewards/accuracies": 0.0, "rewards/chosen": 3.6647934913635254, "rewards/margins": -1.2142829895019531, "rewards/rejected": 4.8790764808654785, "step": 6018 }, { "epoch": 1.33, "learning_rate": 2.649934740131049e-06, "logits/chosen": -1.8492670059204102, "logits/rejected": -1.7861344814300537, "logps/chosen": -67.0510025024414, "logps/rejected": -30.192726135253906, "loss": 0.1538, "rewards/accuracies": 1.0, "rewards/chosen": 5.3174943923950195, "rewards/margins": 1.5699822902679443, "rewards/rejected": 3.747512102127075, "step": 6019 }, { "epoch": 1.33, "learning_rate": 2.6483528758123188e-06, "logits/chosen": -2.273573398590088, "logits/rejected": -2.2816367149353027, "logps/chosen": -55.831932067871094, "logps/rejected": -72.89797973632812, "loss": 0.1904, "rewards/accuracies": 1.0, "rewards/chosen": 4.349602699279785, "rewards/margins": 0.7747888565063477, "rewards/rejected": 3.5748138427734375, "step": 6020 }, { "epoch": 1.33, "learning_rate": 2.646771313674592e-06, "logits/chosen": -1.8342182636260986, "logits/rejected": -1.679218053817749, "logps/chosen": -153.86668395996094, "logps/rejected": -83.28459930419922, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 8.582972526550293, "rewards/margins": 5.882513999938965, "rewards/rejected": 2.700458526611328, "step": 6021 }, { "epoch": 1.33, "learning_rate": 2.6451900539210928e-06, "logits/chosen": -1.8271220922470093, "logits/rejected": -1.8693000078201294, "logps/chosen": -38.918495178222656, "logps/rejected": -86.89820098876953, "loss": 1.6641, "rewards/accuracies": 0.0, "rewards/chosen": 4.210789680480957, "rewards/margins": -3.2911229133605957, "rewards/rejected": 7.501912593841553, "step": 6022 }, { "epoch": 1.33, "learning_rate": 2.643609096755011e-06, "logits/chosen": -2.198444128036499, "logits/rejected": -2.2031466960906982, "logps/chosen": -125.54338073730469, "logps/rejected": -84.52050018310547, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 6.966975688934326, "rewards/margins": 3.5852205753326416, "rewards/rejected": 3.3817551136016846, "step": 6023 }, { "epoch": 1.33, "learning_rate": 2.6420284423794952e-06, "logits/chosen": -2.127692937850952, "logits/rejected": -2.148456573486328, "logps/chosen": -24.38926124572754, "logps/rejected": -51.545501708984375, "loss": 0.9154, "rewards/accuracies": 0.0, "rewards/chosen": 3.7660820484161377, "rewards/margins": -0.06595540046691895, "rewards/rejected": 3.8320374488830566, "step": 6024 }, { "epoch": 1.33, "learning_rate": 2.640448090997656e-06, "logits/chosen": -2.0085995197296143, "logits/rejected": -2.0394630432128906, "logps/chosen": -40.06264114379883, "logps/rejected": -58.36556625366211, "loss": 0.4513, "rewards/accuracies": 0.0, "rewards/chosen": 2.290781021118164, "rewards/margins": -0.35807037353515625, "rewards/rejected": 2.6488513946533203, "step": 6025 }, { "epoch": 1.33, "learning_rate": 2.6388680428125657e-06, "logits/chosen": -1.9338264465332031, "logits/rejected": -1.900061011314392, "logps/chosen": -64.30187225341797, "logps/rejected": -91.45413970947266, "loss": 0.1896, "rewards/accuracies": 1.0, "rewards/chosen": 5.023858547210693, "rewards/margins": 0.9863286018371582, "rewards/rejected": 4.037529945373535, "step": 6026 }, { "epoch": 1.33, "learning_rate": 2.6372882980272552e-06, "logits/chosen": -1.8305188417434692, "logits/rejected": -1.8305188417434692, "logps/chosen": -71.84410095214844, "logps/rejected": -71.84410095214844, "loss": 0.4949, "rewards/accuracies": 0.0, "rewards/chosen": 4.739555358886719, "rewards/margins": 0.0, "rewards/rejected": 4.739555358886719, "step": 6027 }, { "epoch": 1.33, "learning_rate": 2.635708856844719e-06, "logits/chosen": -1.8620322942733765, "logits/rejected": -1.810624599456787, "logps/chosen": -128.82872009277344, "logps/rejected": -49.39271926879883, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 7.8587965965271, "rewards/margins": 4.16147518157959, "rewards/rejected": 3.697321653366089, "step": 6028 }, { "epoch": 1.33, "learning_rate": 2.634129719467911e-06, "logits/chosen": -2.0120716094970703, "logits/rejected": -2.049961805343628, "logps/chosen": -48.48997497558594, "logps/rejected": -94.4937744140625, "loss": 0.6669, "rewards/accuracies": 0.0, "rewards/chosen": 4.089534282684326, "rewards/margins": -1.0249557495117188, "rewards/rejected": 5.114490032196045, "step": 6029 }, { "epoch": 1.33, "learning_rate": 2.6325508860997482e-06, "logits/chosen": -1.7225450277328491, "logits/rejected": -1.6699172258377075, "logps/chosen": -47.31639099121094, "logps/rejected": -57.96244812011719, "loss": 1.0915, "rewards/accuracies": 0.0, "rewards/chosen": 3.3638107776641846, "rewards/margins": -0.13375163078308105, "rewards/rejected": 3.4975624084472656, "step": 6030 }, { "epoch": 1.33, "learning_rate": 2.6309723569431047e-06, "logits/chosen": -2.065760374069214, "logits/rejected": -2.077420949935913, "logps/chosen": -41.4673957824707, "logps/rejected": -39.113868713378906, "loss": 0.182, "rewards/accuracies": 1.0, "rewards/chosen": 3.541779041290283, "rewards/margins": 0.8628895282745361, "rewards/rejected": 2.678889513015747, "step": 6031 }, { "epoch": 1.34, "learning_rate": 2.6293941322008163e-06, "logits/chosen": -1.905326008796692, "logits/rejected": -1.8166065216064453, "logps/chosen": -36.72613525390625, "logps/rejected": -27.061899185180664, "loss": 0.4544, "rewards/accuracies": 0.0, "rewards/chosen": 4.211331367492676, "rewards/margins": -0.3250393867492676, "rewards/rejected": 4.536370754241943, "step": 6032 }, { "epoch": 1.34, "learning_rate": 2.6278162120756845e-06, "logits/chosen": -1.7556354999542236, "logits/rejected": -1.7045413255691528, "logps/chosen": -32.74547576904297, "logps/rejected": -29.629005432128906, "loss": 0.2424, "rewards/accuracies": 1.0, "rewards/chosen": 2.161273241043091, "rewards/margins": 0.6602745056152344, "rewards/rejected": 1.5009987354278564, "step": 6033 }, { "epoch": 1.34, "learning_rate": 2.626238596770467e-06, "logits/chosen": -1.8194974660873413, "logits/rejected": -1.78326416015625, "logps/chosen": -37.74174118041992, "logps/rejected": -20.806655883789062, "loss": 0.3556, "rewards/accuracies": 1.0, "rewards/chosen": 2.4199979305267334, "rewards/margins": 0.6780885457992554, "rewards/rejected": 1.741909384727478, "step": 6034 }, { "epoch": 1.34, "learning_rate": 2.624661286487883e-06, "logits/chosen": -2.160414695739746, "logits/rejected": -2.1401162147521973, "logps/chosen": -45.14411163330078, "logps/rejected": -12.638436317443848, "loss": 0.3695, "rewards/accuracies": 1.0, "rewards/chosen": 3.7759101390838623, "rewards/margins": 3.060464382171631, "rewards/rejected": 0.7154456377029419, "step": 6035 }, { "epoch": 1.34, "learning_rate": 2.6230842814306133e-06, "logits/chosen": -1.7506924867630005, "logits/rejected": -1.739336609840393, "logps/chosen": -35.85361099243164, "logps/rejected": -39.70854568481445, "loss": 0.5023, "rewards/accuracies": 1.0, "rewards/chosen": 2.997826099395752, "rewards/margins": 1.0064873695373535, "rewards/rejected": 1.9913387298583984, "step": 6036 }, { "epoch": 1.34, "learning_rate": 2.6215075818013005e-06, "logits/chosen": -2.1215553283691406, "logits/rejected": -2.0351436138153076, "logps/chosen": -117.80854797363281, "logps/rejected": -107.18075561523438, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 10.469611167907715, "rewards/margins": 4.946767330169678, "rewards/rejected": 5.522843837738037, "step": 6037 }, { "epoch": 1.34, "learning_rate": 2.6199311878025445e-06, "logits/chosen": -1.8672009706497192, "logits/rejected": -1.831937313079834, "logps/chosen": -52.9409065246582, "logps/rejected": -54.25270080566406, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 6.183196544647217, "rewards/margins": 1.6438407897949219, "rewards/rejected": 4.539355754852295, "step": 6038 }, { "epoch": 1.34, "learning_rate": 2.61835509963691e-06, "logits/chosen": -1.8686203956604004, "logits/rejected": -1.8611688613891602, "logps/chosen": -37.2769775390625, "logps/rejected": -47.70106506347656, "loss": 0.4393, "rewards/accuracies": 0.0, "rewards/chosen": 2.606137990951538, "rewards/margins": -0.2977933883666992, "rewards/rejected": 2.9039313793182373, "step": 6039 }, { "epoch": 1.34, "learning_rate": 2.616779317506921e-06, "logits/chosen": -1.7026010751724243, "logits/rejected": -1.7204068899154663, "logps/chosen": -38.17649841308594, "logps/rejected": -72.25090026855469, "loss": 1.5154, "rewards/accuracies": 0.0, "rewards/chosen": 2.5497817993164062, "rewards/margins": -2.358203887939453, "rewards/rejected": 4.907985687255859, "step": 6040 }, { "epoch": 1.34, "learning_rate": 2.61520384161506e-06, "logits/chosen": -1.8417054414749146, "logits/rejected": -1.7431451082229614, "logps/chosen": -105.4183349609375, "logps/rejected": -53.96391296386719, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 6.423758029937744, "rewards/margins": 5.3970627784729, "rewards/rejected": 1.0266952514648438, "step": 6041 }, { "epoch": 1.34, "learning_rate": 2.613628672163772e-06, "logits/chosen": -1.9628278017044067, "logits/rejected": -1.9531182050704956, "logps/chosen": -52.28877258300781, "logps/rejected": -45.822784423828125, "loss": 0.1542, "rewards/accuracies": 1.0, "rewards/chosen": 3.828148603439331, "rewards/margins": 1.037766933441162, "rewards/rejected": 2.790381669998169, "step": 6042 }, { "epoch": 1.34, "learning_rate": 2.6120538093554625e-06, "logits/chosen": -2.2261993885040283, "logits/rejected": -2.2190990447998047, "logps/chosen": -52.640647888183594, "logps/rejected": -69.14551544189453, "loss": 0.2095, "rewards/accuracies": 1.0, "rewards/chosen": 5.848973751068115, "rewards/margins": 2.332676649093628, "rewards/rejected": 3.5162971019744873, "step": 6043 }, { "epoch": 1.34, "learning_rate": 2.6104792533924984e-06, "logits/chosen": -1.7114726305007935, "logits/rejected": -1.6287697553634644, "logps/chosen": -34.06412124633789, "logps/rejected": -54.0181999206543, "loss": 0.1778, "rewards/accuracies": 1.0, "rewards/chosen": 3.34936261177063, "rewards/margins": 0.9379348754882812, "rewards/rejected": 2.4114277362823486, "step": 6044 }, { "epoch": 1.34, "learning_rate": 2.6089050044772057e-06, "logits/chosen": -2.1844048500061035, "logits/rejected": -2.1844048500061035, "logps/chosen": -45.757728576660156, "logps/rejected": -45.757728576660156, "loss": 0.4483, "rewards/accuracies": 0.0, "rewards/chosen": 3.3456802368164062, "rewards/margins": 0.0, "rewards/rejected": 3.3456802368164062, "step": 6045 }, { "epoch": 1.34, "learning_rate": 2.607331062811872e-06, "logits/chosen": -1.9545371532440186, "logits/rejected": -1.966002106666565, "logps/chosen": -53.67125701904297, "logps/rejected": -109.41270446777344, "loss": 0.1963, "rewards/accuracies": 1.0, "rewards/chosen": 3.937419891357422, "rewards/margins": 0.7796745300292969, "rewards/rejected": 3.157745361328125, "step": 6046 }, { "epoch": 1.34, "learning_rate": 2.6057574285987446e-06, "logits/chosen": -2.038106918334961, "logits/rejected": -1.975993275642395, "logps/chosen": -45.10414123535156, "logps/rejected": -11.441169738769531, "loss": 0.2402, "rewards/accuracies": 1.0, "rewards/chosen": 2.442885637283325, "rewards/margins": 1.0036274194717407, "rewards/rejected": 1.4392582178115845, "step": 6047 }, { "epoch": 1.34, "learning_rate": 2.6041841020400326e-06, "logits/chosen": -2.1652657985687256, "logits/rejected": -2.1653974056243896, "logps/chosen": -46.96498107910156, "logps/rejected": -85.755615234375, "loss": 0.2687, "rewards/accuracies": 1.0, "rewards/chosen": 5.049957275390625, "rewards/margins": 0.425018310546875, "rewards/rejected": 4.62493896484375, "step": 6048 }, { "epoch": 1.34, "learning_rate": 2.602611083337906e-06, "logits/chosen": -1.9313304424285889, "logits/rejected": -1.797049880027771, "logps/chosen": -142.95114135742188, "logps/rejected": -39.00498962402344, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 4.4806976318359375, "rewards/margins": 3.491471529006958, "rewards/rejected": 0.9892261624336243, "step": 6049 }, { "epoch": 1.34, "learning_rate": 2.601038372694488e-06, "logits/chosen": -1.8297098875045776, "logits/rejected": -1.8132812976837158, "logps/chosen": -45.62323760986328, "logps/rejected": -57.80878448486328, "loss": 2.9029, "rewards/accuracies": 1.0, "rewards/chosen": 6.319040775299072, "rewards/margins": 2.254988193511963, "rewards/rejected": 4.064052581787109, "step": 6050 }, { "epoch": 1.34, "learning_rate": 2.5994659703118757e-06, "logits/chosen": -1.6495652198791504, "logits/rejected": -1.7300554513931274, "logps/chosen": -30.696502685546875, "logps/rejected": -75.90007019042969, "loss": 1.3832, "rewards/accuracies": 0.0, "rewards/chosen": 5.0618896484375, "rewards/margins": -2.2444534301757812, "rewards/rejected": 7.306343078613281, "step": 6051 }, { "epoch": 1.34, "learning_rate": 2.5978938763921135e-06, "logits/chosen": -2.0255565643310547, "logits/rejected": -2.040931224822998, "logps/chosen": -59.29049301147461, "logps/rejected": -69.55430603027344, "loss": 0.1605, "rewards/accuracies": 1.0, "rewards/chosen": 3.932291030883789, "rewards/margins": 1.334273099899292, "rewards/rejected": 2.598017930984497, "step": 6052 }, { "epoch": 1.34, "learning_rate": 2.5963220911372145e-06, "logits/chosen": -2.207353115081787, "logits/rejected": -2.2458713054656982, "logps/chosen": -21.74410629272461, "logps/rejected": -163.0996856689453, "loss": 1.7782, "rewards/accuracies": 0.0, "rewards/chosen": 3.351332187652588, "rewards/margins": -3.466610908508301, "rewards/rejected": 6.817943096160889, "step": 6053 }, { "epoch": 1.34, "learning_rate": 2.594750614749148e-06, "logits/chosen": -1.8256592750549316, "logits/rejected": -1.6319217681884766, "logps/chosen": -88.82353210449219, "logps/rejected": -76.61895751953125, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": 7.970198154449463, "rewards/margins": 1.1212449073791504, "rewards/rejected": 6.8489532470703125, "step": 6054 }, { "epoch": 1.34, "learning_rate": 2.5931794474298456e-06, "logits/chosen": -1.863778829574585, "logits/rejected": -1.7327767610549927, "logps/chosen": -89.37291717529297, "logps/rejected": -46.659034729003906, "loss": 0.2471, "rewards/accuracies": 1.0, "rewards/chosen": 5.4589762687683105, "rewards/margins": 1.6961748600006104, "rewards/rejected": 3.7628014087677, "step": 6055 }, { "epoch": 1.34, "learning_rate": 2.5916085893811983e-06, "logits/chosen": -1.9489219188690186, "logits/rejected": -1.8321635723114014, "logps/chosen": -81.77705383300781, "logps/rejected": -79.61040496826172, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 4.994604587554932, "rewards/margins": 3.369370460510254, "rewards/rejected": 1.6252342462539673, "step": 6056 }, { "epoch": 1.34, "learning_rate": 2.5900380408050575e-06, "logits/chosen": -1.737014651298523, "logits/rejected": -1.444966197013855, "logps/chosen": -30.019901275634766, "logps/rejected": -69.59660339355469, "loss": 2.3191, "rewards/accuracies": 0.0, "rewards/chosen": 3.0515995025634766, "rewards/margins": -3.1467785835266113, "rewards/rejected": 6.198378086090088, "step": 6057 }, { "epoch": 1.34, "learning_rate": 2.5884678019032372e-06, "logits/chosen": -2.035769462585449, "logits/rejected": -1.8553553819656372, "logps/chosen": -122.1229476928711, "logps/rejected": -21.955276489257812, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 6.63042688369751, "rewards/margins": 5.121131896972656, "rewards/rejected": 1.509294867515564, "step": 6058 }, { "epoch": 1.34, "learning_rate": 2.586897872877503e-06, "logits/chosen": -2.03707218170166, "logits/rejected": -1.7979835271835327, "logps/chosen": -94.7822494506836, "logps/rejected": -13.323001861572266, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 6.374213695526123, "rewards/margins": 5.189677715301514, "rewards/rejected": 1.1845359802246094, "step": 6059 }, { "epoch": 1.34, "learning_rate": 2.5853282539295953e-06, "logits/chosen": -2.032918691635132, "logits/rejected": -1.9890198707580566, "logps/chosen": -38.058712005615234, "logps/rejected": -93.25704193115234, "loss": 0.688, "rewards/accuracies": 0.0, "rewards/chosen": 7.124293804168701, "rewards/margins": -1.0819659233093262, "rewards/rejected": 8.206259727478027, "step": 6060 }, { "epoch": 1.34, "learning_rate": 2.5837589452612006e-06, "logits/chosen": -2.0256452560424805, "logits/rejected": -1.7901568412780762, "logps/chosen": -112.4233169555664, "logps/rejected": -52.95011901855469, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 6.80640172958374, "rewards/margins": 5.7395548820495605, "rewards/rejected": 1.0668468475341797, "step": 6061 }, { "epoch": 1.34, "learning_rate": 2.5821899470739723e-06, "logits/chosen": -1.8383060693740845, "logits/rejected": -1.8368483781814575, "logps/chosen": -34.085975646972656, "logps/rejected": -49.26980209350586, "loss": 0.5612, "rewards/accuracies": 1.0, "rewards/chosen": 2.99615478515625, "rewards/margins": 0.3041088581085205, "rewards/rejected": 2.6920459270477295, "step": 6062 }, { "epoch": 1.34, "learning_rate": 2.580621259569524e-06, "logits/chosen": -1.8767799139022827, "logits/rejected": -1.8760970830917358, "logps/chosen": -32.26634216308594, "logps/rejected": -28.727846145629883, "loss": 0.2645, "rewards/accuracies": 1.0, "rewards/chosen": 1.8262951374053955, "rewards/margins": 0.46200358867645264, "rewards/rejected": 1.3642915487289429, "step": 6063 }, { "epoch": 1.34, "learning_rate": 2.5790528829494278e-06, "logits/chosen": -1.7969462871551514, "logits/rejected": -1.6223169565200806, "logps/chosen": -94.7684097290039, "logps/rejected": -35.96173858642578, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 8.729543685913086, "rewards/margins": 5.017361640930176, "rewards/rejected": 3.712181806564331, "step": 6064 }, { "epoch": 1.34, "learning_rate": 2.5774848174152166e-06, "logits/chosen": -2.299487352371216, "logits/rejected": -2.259847640991211, "logps/chosen": -55.97223663330078, "logps/rejected": -10.364060401916504, "loss": 0.2631, "rewards/accuracies": 1.0, "rewards/chosen": 3.5017921924591064, "rewards/margins": 0.37320566177368164, "rewards/rejected": 3.128586530685425, "step": 6065 }, { "epoch": 1.34, "learning_rate": 2.575917063168382e-06, "logits/chosen": -2.076939821243286, "logits/rejected": -2.0739939212799072, "logps/chosen": -40.26862335205078, "logps/rejected": -46.401466369628906, "loss": 0.146, "rewards/accuracies": 1.0, "rewards/chosen": 3.304417371749878, "rewards/margins": 1.0833189487457275, "rewards/rejected": 2.2210984230041504, "step": 6066 }, { "epoch": 1.34, "learning_rate": 2.5743496204103803e-06, "logits/chosen": -2.1935975551605225, "logits/rejected": -2.1648263931274414, "logps/chosen": -78.55857849121094, "logps/rejected": -37.36520767211914, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 5.70955228805542, "rewards/margins": 2.2858541011810303, "rewards/rejected": 3.4236981868743896, "step": 6067 }, { "epoch": 1.34, "learning_rate": 2.572782489342617e-06, "logits/chosen": -2.0022571086883545, "logits/rejected": -1.9115244150161743, "logps/chosen": -50.324485778808594, "logps/rejected": -17.11402702331543, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 3.712899923324585, "rewards/margins": 3.3921375274658203, "rewards/rejected": 0.3207624554634094, "step": 6068 }, { "epoch": 1.34, "learning_rate": 2.5712156701664724e-06, "logits/chosen": -1.8229323625564575, "logits/rejected": -1.8051912784576416, "logps/chosen": -47.227691650390625, "logps/rejected": -76.24879455566406, "loss": 0.2786, "rewards/accuracies": 1.0, "rewards/chosen": 3.62860107421875, "rewards/margins": 0.5724470615386963, "rewards/rejected": 3.0561540126800537, "step": 6069 }, { "epoch": 1.34, "learning_rate": 2.569649163083272e-06, "logits/chosen": -1.8744661808013916, "logits/rejected": -1.8180267810821533, "logps/chosen": -32.30843734741211, "logps/rejected": -21.428749084472656, "loss": 0.1327, "rewards/accuracies": 1.0, "rewards/chosen": 2.2086904048919678, "rewards/margins": 1.3102350234985352, "rewards/rejected": 0.8984554409980774, "step": 6070 }, { "epoch": 1.34, "learning_rate": 2.568082968294316e-06, "logits/chosen": -1.7548959255218506, "logits/rejected": -1.6828269958496094, "logps/chosen": -111.40960693359375, "logps/rejected": -80.4814453125, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 10.183300971984863, "rewards/margins": 4.892071723937988, "rewards/rejected": 5.291229248046875, "step": 6071 }, { "epoch": 1.34, "learning_rate": 2.5665170860008503e-06, "logits/chosen": -2.064033269882202, "logits/rejected": -1.8010947704315186, "logps/chosen": -31.562551498413086, "logps/rejected": -143.6083984375, "loss": 0.5172, "rewards/accuracies": 0.0, "rewards/chosen": 5.228545665740967, "rewards/margins": -0.5375523567199707, "rewards/rejected": 5.7660980224609375, "step": 6072 }, { "epoch": 1.34, "learning_rate": 2.5649515164040884e-06, "logits/chosen": -2.1106696128845215, "logits/rejected": -2.206130027770996, "logps/chosen": -86.26058197021484, "logps/rejected": -126.70597076416016, "loss": 0.2873, "rewards/accuracies": 1.0, "rewards/chosen": 11.66870403289795, "rewards/margins": 0.49689388275146484, "rewards/rejected": 11.171810150146484, "step": 6073 }, { "epoch": 1.34, "learning_rate": 2.5633862597052063e-06, "logits/chosen": -1.7971364259719849, "logits/rejected": -1.590631127357483, "logps/chosen": -91.78863525390625, "logps/rejected": -83.85931396484375, "loss": 0.3622, "rewards/accuracies": 1.0, "rewards/chosen": 8.05041217803955, "rewards/margins": 3.4509949684143066, "rewards/rejected": 4.599417209625244, "step": 6074 }, { "epoch": 1.34, "learning_rate": 2.5618213161053273e-06, "logits/chosen": -2.0222280025482178, "logits/rejected": -2.037122964859009, "logps/chosen": -29.95738983154297, "logps/rejected": -412.8065185546875, "loss": 4.4082, "rewards/accuracies": 0.0, "rewards/chosen": 6.333451271057129, "rewards/margins": -8.686580657958984, "rewards/rejected": 15.020031929016113, "step": 6075 }, { "epoch": 1.34, "learning_rate": 2.5602566858055522e-06, "logits/chosen": -1.8101110458374023, "logits/rejected": -1.7224838733673096, "logps/chosen": -42.467262268066406, "logps/rejected": -15.903643608093262, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": 3.3429603576660156, "rewards/margins": 2.9125044345855713, "rewards/rejected": 0.43045589327812195, "step": 6076 }, { "epoch": 1.35, "learning_rate": 2.5586923690069244e-06, "logits/chosen": -1.8804587125778198, "logits/rejected": -1.9142305850982666, "logps/chosen": -40.634037017822266, "logps/rejected": -101.86739349365234, "loss": 0.582, "rewards/accuracies": 1.0, "rewards/chosen": 5.997689723968506, "rewards/margins": 0.5085654258728027, "rewards/rejected": 5.489124298095703, "step": 6077 }, { "epoch": 1.35, "learning_rate": 2.5571283659104617e-06, "logits/chosen": -1.8639516830444336, "logits/rejected": -1.9184774160385132, "logps/chosen": -68.93092346191406, "logps/rejected": -95.67942810058594, "loss": 0.4565, "rewards/accuracies": 0.0, "rewards/chosen": 7.747457981109619, "rewards/margins": -0.3724331855773926, "rewards/rejected": 8.119891166687012, "step": 6078 }, { "epoch": 1.35, "learning_rate": 2.5555646767171277e-06, "logits/chosen": -1.8847557306289673, "logits/rejected": -1.8847557306289673, "logps/chosen": -41.39739990234375, "logps/rejected": -41.39739990234375, "loss": 0.3553, "rewards/accuracies": 0.0, "rewards/chosen": 4.663900852203369, "rewards/margins": 0.0, "rewards/rejected": 4.663900852203369, "step": 6079 }, { "epoch": 1.35, "learning_rate": 2.554001301627861e-06, "logits/chosen": -2.0743207931518555, "logits/rejected": -2.072861909866333, "logps/chosen": -25.468994140625, "logps/rejected": -50.06324768066406, "loss": 0.5624, "rewards/accuracies": 1.0, "rewards/chosen": 3.399984836578369, "rewards/margins": 0.8389160633087158, "rewards/rejected": 2.5610687732696533, "step": 6080 }, { "epoch": 1.35, "learning_rate": 2.5524382408435446e-06, "logits/chosen": -1.8108116388320923, "logits/rejected": -1.8108116388320923, "logps/chosen": -54.06625747680664, "logps/rejected": -54.06625747680664, "loss": 0.3467, "rewards/accuracies": 0.0, "rewards/chosen": 5.244318962097168, "rewards/margins": 0.0, "rewards/rejected": 5.244318962097168, "step": 6081 }, { "epoch": 1.35, "learning_rate": 2.5508754945650305e-06, "logits/chosen": -2.1039695739746094, "logits/rejected": -2.046344518661499, "logps/chosen": -59.403995513916016, "logps/rejected": -11.0057373046875, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 3.8642940521240234, "rewards/margins": 3.4153361320495605, "rewards/rejected": 0.4489578306674957, "step": 6082 }, { "epoch": 1.35, "learning_rate": 2.5493130629931297e-06, "logits/chosen": -1.8664969205856323, "logits/rejected": -1.8235238790512085, "logps/chosen": -90.99323272705078, "logps/rejected": -136.40896606445312, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 8.869925498962402, "rewards/margins": 2.9375858306884766, "rewards/rejected": 5.932339668273926, "step": 6083 }, { "epoch": 1.35, "learning_rate": 2.5477509463286053e-06, "logits/chosen": -2.2096924781799316, "logits/rejected": -2.2161881923675537, "logps/chosen": -98.63526916503906, "logps/rejected": -75.08799743652344, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 5.458041667938232, "rewards/margins": 2.0860812664031982, "rewards/rejected": 3.371960401535034, "step": 6084 }, { "epoch": 1.35, "learning_rate": 2.5461891447721932e-06, "logits/chosen": -1.803019642829895, "logits/rejected": -1.7709325551986694, "logps/chosen": -33.70448684692383, "logps/rejected": -44.655792236328125, "loss": 1.1308, "rewards/accuracies": 1.0, "rewards/chosen": 2.3643314838409424, "rewards/margins": 0.8319309949874878, "rewards/rejected": 1.5324004888534546, "step": 6085 }, { "epoch": 1.35, "learning_rate": 2.5446276585245733e-06, "logits/chosen": -1.872827172279358, "logits/rejected": -1.8484580516815186, "logps/chosen": -32.124542236328125, "logps/rejected": -50.771339416503906, "loss": 0.2234, "rewards/accuracies": 1.0, "rewards/chosen": 3.020979404449463, "rewards/margins": 0.586113691329956, "rewards/rejected": 2.434865713119507, "step": 6086 }, { "epoch": 1.35, "learning_rate": 2.5430664877863997e-06, "logits/chosen": -2.1442036628723145, "logits/rejected": -2.1118788719177246, "logps/chosen": -48.63976287841797, "logps/rejected": -57.009037017822266, "loss": 0.2328, "rewards/accuracies": 1.0, "rewards/chosen": 4.00673770904541, "rewards/margins": 1.0492079257965088, "rewards/rejected": 2.9575297832489014, "step": 6087 }, { "epoch": 1.35, "learning_rate": 2.541505632758273e-06, "logits/chosen": -2.1316404342651367, "logits/rejected": -2.068040132522583, "logps/chosen": -74.72132110595703, "logps/rejected": -127.36775970458984, "loss": 0.1754, "rewards/accuracies": 1.0, "rewards/chosen": 7.507110118865967, "rewards/margins": 0.919896125793457, "rewards/rejected": 6.58721399307251, "step": 6088 }, { "epoch": 1.35, "learning_rate": 2.5399450936407665e-06, "logits/chosen": -2.291738271713257, "logits/rejected": -2.333444595336914, "logps/chosen": -111.45152282714844, "logps/rejected": -128.67730712890625, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 8.428171157836914, "rewards/margins": 4.475519180297852, "rewards/rejected": 3.9526519775390625, "step": 6089 }, { "epoch": 1.35, "learning_rate": 2.5383848706344e-06, "logits/chosen": -1.9981708526611328, "logits/rejected": -1.981527328491211, "logps/chosen": -88.61447143554688, "logps/rejected": -52.35980987548828, "loss": 0.1751, "rewards/accuracies": 1.0, "rewards/chosen": 4.795304775238037, "rewards/margins": 0.9044044017791748, "rewards/rejected": 3.8909003734588623, "step": 6090 }, { "epoch": 1.35, "learning_rate": 2.536824963939659e-06, "logits/chosen": -1.8305143117904663, "logits/rejected": -1.760556936264038, "logps/chosen": -46.1053466796875, "logps/rejected": -68.31365203857422, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": 3.6776046752929688, "rewards/margins": 0.7944777011871338, "rewards/rejected": 2.883126974105835, "step": 6091 }, { "epoch": 1.35, "learning_rate": 2.5352653737569922e-06, "logits/chosen": -2.2737576961517334, "logits/rejected": -2.2268271446228027, "logps/chosen": -71.41300964355469, "logps/rejected": -162.0093994140625, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 10.178736686706543, "rewards/margins": 2.9978251457214355, "rewards/rejected": 7.180911540985107, "step": 6092 }, { "epoch": 1.35, "learning_rate": 2.533706100286795e-06, "logits/chosen": -2.0198028087615967, "logits/rejected": -2.0198028087615967, "logps/chosen": -46.400718688964844, "logps/rejected": -46.400718688964844, "loss": 0.3597, "rewards/accuracies": 0.0, "rewards/chosen": 1.697649359703064, "rewards/margins": 0.0, "rewards/rejected": 1.697649359703064, "step": 6093 }, { "epoch": 1.35, "learning_rate": 2.532147143729439e-06, "logits/chosen": -1.9666109085083008, "logits/rejected": -1.8855392932891846, "logps/chosen": -80.88854217529297, "logps/rejected": -39.76124954223633, "loss": 0.1346, "rewards/accuracies": 1.0, "rewards/chosen": 7.646836280822754, "rewards/margins": 2.4383764266967773, "rewards/rejected": 5.208459854125977, "step": 6094 }, { "epoch": 1.35, "learning_rate": 2.5305885042852385e-06, "logits/chosen": -1.9289436340332031, "logits/rejected": -1.9289436340332031, "logps/chosen": -45.038978576660156, "logps/rejected": -45.038978576660156, "loss": 0.5577, "rewards/accuracies": 0.0, "rewards/chosen": 4.681472301483154, "rewards/margins": 0.0, "rewards/rejected": 4.681472301483154, "step": 6095 }, { "epoch": 1.35, "learning_rate": 2.5290301821544826e-06, "logits/chosen": -1.9301871061325073, "logits/rejected": -1.9283770322799683, "logps/chosen": -67.88738250732422, "logps/rejected": -39.06594467163086, "loss": 0.287, "rewards/accuracies": 1.0, "rewards/chosen": 3.2658638954162598, "rewards/margins": 0.2976512908935547, "rewards/rejected": 2.968212604522705, "step": 6096 }, { "epoch": 1.35, "learning_rate": 2.5274721775374074e-06, "logits/chosen": -1.8899751901626587, "logits/rejected": -1.8899751901626587, "logps/chosen": -67.97669982910156, "logps/rejected": -67.97669982910156, "loss": 0.3835, "rewards/accuracies": 0.0, "rewards/chosen": 1.7354011535644531, "rewards/margins": 0.0, "rewards/rejected": 1.7354011535644531, "step": 6097 }, { "epoch": 1.35, "learning_rate": 2.5259144906342126e-06, "logits/chosen": -1.7021008729934692, "logits/rejected": -1.6354209184646606, "logps/chosen": -101.15132141113281, "logps/rejected": -47.519927978515625, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 5.885371685028076, "rewards/margins": 3.26007342338562, "rewards/rejected": 2.625298261642456, "step": 6098 }, { "epoch": 1.35, "learning_rate": 2.5243571216450585e-06, "logits/chosen": -1.7456672191619873, "logits/rejected": -1.7848793268203735, "logps/chosen": -55.13818359375, "logps/rejected": -118.15664672851562, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 6.467635631561279, "rewards/margins": 2.3345437049865723, "rewards/rejected": 4.133091926574707, "step": 6099 }, { "epoch": 1.35, "learning_rate": 2.522800070770063e-06, "logits/chosen": -1.994645595550537, "logits/rejected": -1.9638663530349731, "logps/chosen": -120.84093475341797, "logps/rejected": -6.381191730499268, "loss": 0.0802, "rewards/accuracies": 1.0, "rewards/chosen": 7.053386688232422, "rewards/margins": 5.7244768142700195, "rewards/rejected": 1.3289098739624023, "step": 6100 }, { "epoch": 1.35, "learning_rate": 2.521243338209304e-06, "logits/chosen": -2.135564088821411, "logits/rejected": -2.1298792362213135, "logps/chosen": -33.085121154785156, "logps/rejected": -53.22475051879883, "loss": 0.6009, "rewards/accuracies": 0.0, "rewards/chosen": 2.852214813232422, "rewards/margins": -0.8225681781768799, "rewards/rejected": 3.6747829914093018, "step": 6101 }, { "epoch": 1.35, "learning_rate": 2.5196869241628165e-06, "logits/chosen": -1.838789463043213, "logits/rejected": -1.8598904609680176, "logps/chosen": -101.61296844482422, "logps/rejected": -112.05811309814453, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 7.40065860748291, "rewards/margins": 2.1947145462036133, "rewards/rejected": 5.205944061279297, "step": 6102 }, { "epoch": 1.35, "learning_rate": 2.5181308288305985e-06, "logits/chosen": -1.6963903903961182, "logits/rejected": -1.6273249387741089, "logps/chosen": -101.34310913085938, "logps/rejected": -89.7685546875, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 6.721086025238037, "rewards/margins": 3.596276044845581, "rewards/rejected": 3.124809980392456, "step": 6103 }, { "epoch": 1.35, "learning_rate": 2.5165750524125994e-06, "logits/chosen": -2.021064519882202, "logits/rejected": -1.948904275894165, "logps/chosen": -56.81262969970703, "logps/rejected": -50.849525451660156, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 5.995861053466797, "rewards/margins": 2.526442766189575, "rewards/rejected": 3.4694182872772217, "step": 6104 }, { "epoch": 1.35, "learning_rate": 2.5150195951087396e-06, "logits/chosen": -1.9670881032943726, "logits/rejected": -1.9207321405410767, "logps/chosen": -155.92967224121094, "logps/rejected": -69.37297058105469, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 8.372795104980469, "rewards/margins": 3.6184706687927246, "rewards/rejected": 4.754324436187744, "step": 6105 }, { "epoch": 1.35, "learning_rate": 2.513464457118886e-06, "logits/chosen": -1.8683518171310425, "logits/rejected": -1.8938783407211304, "logps/chosen": -45.68035125732422, "logps/rejected": -80.68114471435547, "loss": 0.284, "rewards/accuracies": 1.0, "rewards/chosen": 4.188625335693359, "rewards/margins": 0.9459350109100342, "rewards/rejected": 3.242690324783325, "step": 6106 }, { "epoch": 1.35, "learning_rate": 2.5119096386428727e-06, "logits/chosen": -1.766434907913208, "logits/rejected": -1.6434440612792969, "logps/chosen": -52.501827239990234, "logps/rejected": -70.10359191894531, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 7.118360996246338, "rewards/margins": 3.0690040588378906, "rewards/rejected": 4.049356937408447, "step": 6107 }, { "epoch": 1.35, "learning_rate": 2.510355139880489e-06, "logits/chosen": -1.8662186861038208, "logits/rejected": -1.6885919570922852, "logps/chosen": -134.74314880371094, "logps/rejected": -54.726993560791016, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 7.038191318511963, "rewards/margins": 3.1258504390716553, "rewards/rejected": 3.9123408794403076, "step": 6108 }, { "epoch": 1.35, "learning_rate": 2.5088009610314855e-06, "logits/chosen": -2.23374342918396, "logits/rejected": -2.1614720821380615, "logps/chosen": -50.03804397583008, "logps/rejected": -56.23225402832031, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 4.681735038757324, "rewards/margins": 1.8431806564331055, "rewards/rejected": 2.8385543823242188, "step": 6109 }, { "epoch": 1.35, "learning_rate": 2.5072471022955703e-06, "logits/chosen": -2.059678316116333, "logits/rejected": -2.0734801292419434, "logps/chosen": -34.074745178222656, "logps/rejected": -61.06388854980469, "loss": 1.0116, "rewards/accuracies": 0.0, "rewards/chosen": 3.9598357677459717, "rewards/margins": -1.5300109386444092, "rewards/rejected": 5.489846706390381, "step": 6110 }, { "epoch": 1.35, "learning_rate": 2.505693563872409e-06, "logits/chosen": -1.9882116317749023, "logits/rejected": -1.9883532524108887, "logps/chosen": -53.10600662231445, "logps/rejected": -58.89828109741211, "loss": 0.4409, "rewards/accuracies": 1.0, "rewards/chosen": 4.812435626983643, "rewards/margins": 0.04208230972290039, "rewards/rejected": 4.770353317260742, "step": 6111 }, { "epoch": 1.35, "learning_rate": 2.504140345961631e-06, "logits/chosen": -2.0698187351226807, "logits/rejected": -2.001585006713867, "logps/chosen": -50.41252899169922, "logps/rejected": -123.41702270507812, "loss": 0.4041, "rewards/accuracies": 1.0, "rewards/chosen": 7.994603633880615, "rewards/margins": 0.05110979080200195, "rewards/rejected": 7.943493843078613, "step": 6112 }, { "epoch": 1.35, "learning_rate": 2.5025874487628143e-06, "logits/chosen": -1.9060200452804565, "logits/rejected": -1.8820843696594238, "logps/chosen": -45.039794921875, "logps/rejected": -53.051063537597656, "loss": 1.7076, "rewards/accuracies": 1.0, "rewards/chosen": 2.79978346824646, "rewards/margins": 1.1697160005569458, "rewards/rejected": 1.6300674676895142, "step": 6113 }, { "epoch": 1.35, "learning_rate": 2.5010348724755117e-06, "logits/chosen": -1.821974754333496, "logits/rejected": -1.801032543182373, "logps/chosen": -35.26076126098633, "logps/rejected": -34.89817810058594, "loss": 0.3994, "rewards/accuracies": 1.0, "rewards/chosen": 3.0727245807647705, "rewards/margins": 0.8379878997802734, "rewards/rejected": 2.234736680984497, "step": 6114 }, { "epoch": 1.35, "learning_rate": 2.499482617299218e-06, "logits/chosen": -1.9082975387573242, "logits/rejected": -1.8756740093231201, "logps/chosen": -52.031028747558594, "logps/rejected": -55.040191650390625, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": 4.578510284423828, "rewards/margins": 1.1300499439239502, "rewards/rejected": 3.448460340499878, "step": 6115 }, { "epoch": 1.35, "learning_rate": 2.4979306834333965e-06, "logits/chosen": -1.289209008216858, "logits/rejected": -1.2374343872070312, "logps/chosen": -23.266040802001953, "logps/rejected": -36.982078552246094, "loss": 0.2686, "rewards/accuracies": 1.0, "rewards/chosen": 3.2067601680755615, "rewards/margins": 0.49932384490966797, "rewards/rejected": 2.7074363231658936, "step": 6116 }, { "epoch": 1.35, "learning_rate": 2.4963790710774683e-06, "logits/chosen": -1.8147352933883667, "logits/rejected": -1.7636380195617676, "logps/chosen": -53.372772216796875, "logps/rejected": -36.59457778930664, "loss": 0.2733, "rewards/accuracies": 1.0, "rewards/chosen": 3.370257616043091, "rewards/margins": 0.3695340156555176, "rewards/rejected": 3.0007236003875732, "step": 6117 }, { "epoch": 1.35, "learning_rate": 2.494827780430811e-06, "logits/chosen": -1.943642497062683, "logits/rejected": -1.8685922622680664, "logps/chosen": -38.38616180419922, "logps/rejected": -57.07051467895508, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": 4.387563228607178, "rewards/margins": 1.7170534133911133, "rewards/rejected": 2.6705098152160645, "step": 6118 }, { "epoch": 1.35, "learning_rate": 2.493276811692761e-06, "logits/chosen": -2.230882167816162, "logits/rejected": -2.219965934753418, "logps/chosen": -83.96647644042969, "logps/rejected": -150.24014282226562, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 7.51254415512085, "rewards/margins": 3.3544235229492188, "rewards/rejected": 4.158120632171631, "step": 6119 }, { "epoch": 1.35, "learning_rate": 2.4917261650626145e-06, "logits/chosen": -1.8335895538330078, "logits/rejected": -1.7922708988189697, "logps/chosen": -45.937843322753906, "logps/rejected": -21.99018096923828, "loss": 0.277, "rewards/accuracies": 1.0, "rewards/chosen": 3.4242653846740723, "rewards/margins": 0.3681790828704834, "rewards/rejected": 3.056086301803589, "step": 6120 }, { "epoch": 1.35, "learning_rate": 2.4901758407396265e-06, "logits/chosen": -1.892612338066101, "logits/rejected": -1.9109638929367065, "logps/chosen": -2.219008445739746, "logps/rejected": -11.097646713256836, "loss": 0.3493, "rewards/accuracies": 1.0, "rewards/chosen": 0.5226824879646301, "rewards/margins": 0.11888915300369263, "rewards/rejected": 0.4037933349609375, "step": 6121 }, { "epoch": 1.36, "learning_rate": 2.4886258389230094e-06, "logits/chosen": -2.128401279449463, "logits/rejected": -2.053532361984253, "logps/chosen": -161.89854431152344, "logps/rejected": -107.07872009277344, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 9.114706993103027, "rewards/margins": 3.249753475189209, "rewards/rejected": 5.864953517913818, "step": 6122 }, { "epoch": 1.36, "learning_rate": 2.487076159811937e-06, "logits/chosen": -1.5963839292526245, "logits/rejected": -1.6650450229644775, "logps/chosen": -23.22261619567871, "logps/rejected": -74.50460815429688, "loss": 1.4097, "rewards/accuracies": 0.0, "rewards/chosen": 4.1265482902526855, "rewards/margins": -2.615126609802246, "rewards/rejected": 6.741674900054932, "step": 6123 }, { "epoch": 1.36, "learning_rate": 2.4855268036055346e-06, "logits/chosen": -2.0758724212646484, "logits/rejected": -2.0528693199157715, "logps/chosen": -103.20379638671875, "logps/rejected": -60.52786636352539, "loss": 0.2116, "rewards/accuracies": 1.0, "rewards/chosen": 6.230551242828369, "rewards/margins": 2.911417007446289, "rewards/rejected": 3.31913423538208, "step": 6124 }, { "epoch": 1.36, "learning_rate": 2.483977770502894e-06, "logits/chosen": -1.7393947839736938, "logits/rejected": -1.62599778175354, "logps/chosen": -146.53123474121094, "logps/rejected": -60.16514205932617, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 5.733131408691406, "rewards/margins": 4.165766716003418, "rewards/rejected": 1.5673649311065674, "step": 6125 }, { "epoch": 1.36, "learning_rate": 2.482429060703061e-06, "logits/chosen": -1.8820443153381348, "logits/rejected": -1.8173335790634155, "logps/chosen": -105.66853332519531, "logps/rejected": -51.40269470214844, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 6.908378601074219, "rewards/margins": 3.0939886569976807, "rewards/rejected": 3.814389944076538, "step": 6126 }, { "epoch": 1.36, "learning_rate": 2.4808806744050425e-06, "logits/chosen": -2.1060421466827393, "logits/rejected": -2.016731023788452, "logps/chosen": -67.82432556152344, "logps/rejected": -28.55631446838379, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 4.211529731750488, "rewards/margins": 1.8840155601501465, "rewards/rejected": 2.327514171600342, "step": 6127 }, { "epoch": 1.36, "learning_rate": 2.4793326118078004e-06, "logits/chosen": -2.2264623641967773, "logits/rejected": -2.203598976135254, "logps/chosen": -40.17639923095703, "logps/rejected": -12.122575759887695, "loss": 0.4493, "rewards/accuracies": 1.0, "rewards/chosen": 3.615454912185669, "rewards/margins": 0.807016134262085, "rewards/rejected": 2.808438777923584, "step": 6128 }, { "epoch": 1.36, "learning_rate": 2.477784873110259e-06, "logits/chosen": -1.89602530002594, "logits/rejected": -1.9123700857162476, "logps/chosen": -71.64909362792969, "logps/rejected": -92.9925308227539, "loss": 0.522, "rewards/accuracies": 0.0, "rewards/chosen": 4.891238689422607, "rewards/margins": -0.6065711975097656, "rewards/rejected": 5.497809886932373, "step": 6129 }, { "epoch": 1.36, "learning_rate": 2.4762374585112973e-06, "logits/chosen": -1.875811219215393, "logits/rejected": -1.8110679388046265, "logps/chosen": -57.79518127441406, "logps/rejected": -49.51100540161133, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": 2.1845359802246094, "rewards/margins": 1.7642711400985718, "rewards/rejected": 0.4202648103237152, "step": 6130 }, { "epoch": 1.36, "learning_rate": 2.4746903682097563e-06, "logits/chosen": -1.7098084688186646, "logits/rejected": -1.6843332052230835, "logps/chosen": -53.31191635131836, "logps/rejected": -49.632041931152344, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": 4.021464824676514, "rewards/margins": 2.0548365116119385, "rewards/rejected": 1.9666283130645752, "step": 6131 }, { "epoch": 1.36, "learning_rate": 2.4731436024044337e-06, "logits/chosen": -1.9807970523834229, "logits/rejected": -1.8329144716262817, "logps/chosen": -147.30615234375, "logps/rejected": -22.58369255065918, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 5.570898532867432, "rewards/margins": 5.024661540985107, "rewards/rejected": 0.5462371706962585, "step": 6132 }, { "epoch": 1.36, "learning_rate": 2.471597161294082e-06, "logits/chosen": -1.9999045133590698, "logits/rejected": -2.03905987739563, "logps/chosen": -86.43344116210938, "logps/rejected": -164.34121704101562, "loss": 0.216, "rewards/accuracies": 1.0, "rewards/chosen": 7.783447265625, "rewards/margins": 0.8012280464172363, "rewards/rejected": 6.982219219207764, "step": 6133 }, { "epoch": 1.36, "learning_rate": 2.4700510450774167e-06, "logits/chosen": -1.7043579816818237, "logits/rejected": -1.5328490734100342, "logps/chosen": -62.75528335571289, "logps/rejected": -55.5627555847168, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 8.028032302856445, "rewards/margins": 2.8447952270507812, "rewards/rejected": 5.183237075805664, "step": 6134 }, { "epoch": 1.36, "learning_rate": 2.4685052539531108e-06, "logits/chosen": -2.0308737754821777, "logits/rejected": -1.621187686920166, "logps/chosen": -41.6192741394043, "logps/rejected": -44.7675666809082, "loss": 0.6212, "rewards/accuracies": 1.0, "rewards/chosen": 3.390779495239258, "rewards/margins": 0.14832067489624023, "rewards/rejected": 3.2424588203430176, "step": 6135 }, { "epoch": 1.36, "learning_rate": 2.466959788119794e-06, "logits/chosen": -1.742300271987915, "logits/rejected": -1.730210304260254, "logps/chosen": -165.60845947265625, "logps/rejected": -98.193115234375, "loss": 0.79, "rewards/accuracies": 0.0, "rewards/chosen": 7.107362270355225, "rewards/margins": -1.3489975929260254, "rewards/rejected": 8.45635986328125, "step": 6136 }, { "epoch": 1.36, "learning_rate": 2.465414647776055e-06, "logits/chosen": -2.260080099105835, "logits/rejected": -2.1656625270843506, "logps/chosen": -146.14743041992188, "logps/rejected": -21.757884979248047, "loss": 0.1043, "rewards/accuracies": 1.0, "rewards/chosen": 3.6212158203125, "rewards/margins": 2.3111648559570312, "rewards/rejected": 1.3100509643554688, "step": 6137 }, { "epoch": 1.36, "learning_rate": 2.4638698331204404e-06, "logits/chosen": -1.7663830518722534, "logits/rejected": -1.637774109840393, "logps/chosen": -66.34478759765625, "logps/rejected": -84.30862426757812, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 7.510667324066162, "rewards/margins": 5.38441276550293, "rewards/rejected": 2.1262543201446533, "step": 6138 }, { "epoch": 1.36, "learning_rate": 2.462325344351456e-06, "logits/chosen": -1.8349635601043701, "logits/rejected": -1.5881952047348022, "logps/chosen": -162.48324584960938, "logps/rejected": -71.08000946044922, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 8.204787254333496, "rewards/margins": 3.794698715209961, "rewards/rejected": 4.410088539123535, "step": 6139 }, { "epoch": 1.36, "learning_rate": 2.460781181667565e-06, "logits/chosen": -2.0783989429473877, "logits/rejected": -2.044543981552124, "logps/chosen": -120.73570251464844, "logps/rejected": -45.321739196777344, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 6.223747253417969, "rewards/margins": 2.5189154148101807, "rewards/rejected": 3.704831838607788, "step": 6140 }, { "epoch": 1.36, "learning_rate": 2.4592373452671867e-06, "logits/chosen": -2.0289764404296875, "logits/rejected": -2.0837111473083496, "logps/chosen": -33.537601470947266, "logps/rejected": -94.39239501953125, "loss": 1.9054, "rewards/accuracies": 0.0, "rewards/chosen": 2.1182796955108643, "rewards/margins": -3.072326898574829, "rewards/rejected": 5.190606594085693, "step": 6141 }, { "epoch": 1.36, "learning_rate": 2.4576938353487044e-06, "logits/chosen": -1.6930506229400635, "logits/rejected": -1.6310375928878784, "logps/chosen": -56.847755432128906, "logps/rejected": -70.7836685180664, "loss": 0.5056, "rewards/accuracies": 1.0, "rewards/chosen": 2.877195119857788, "rewards/margins": 0.9823052883148193, "rewards/rejected": 1.8948898315429688, "step": 6142 }, { "epoch": 1.36, "learning_rate": 2.456150652110451e-06, "logits/chosen": -1.843219518661499, "logits/rejected": -1.7725497484207153, "logps/chosen": -81.6143569946289, "logps/rejected": -47.288536071777344, "loss": 0.3041, "rewards/accuracies": 1.0, "rewards/chosen": 4.957804203033447, "rewards/margins": 1.7663140296936035, "rewards/rejected": 3.1914901733398438, "step": 6143 }, { "epoch": 1.36, "learning_rate": 2.454607795750724e-06, "logits/chosen": -1.6723226308822632, "logits/rejected": -1.6846143007278442, "logps/chosen": -34.078575134277344, "logps/rejected": -53.275367736816406, "loss": 0.6465, "rewards/accuracies": 0.0, "rewards/chosen": 3.7216622829437256, "rewards/margins": -0.8925206661224365, "rewards/rejected": 4.614182949066162, "step": 6144 }, { "epoch": 1.36, "learning_rate": 2.453065266467775e-06, "logits/chosen": -1.811516284942627, "logits/rejected": -1.8506239652633667, "logps/chosen": -31.223237991333008, "logps/rejected": -62.03955078125, "loss": 0.4698, "rewards/accuracies": 0.0, "rewards/chosen": 2.982562780380249, "rewards/margins": -0.385174036026001, "rewards/rejected": 3.36773681640625, "step": 6145 }, { "epoch": 1.36, "learning_rate": 2.4515230644598177e-06, "logits/chosen": -1.9243274927139282, "logits/rejected": -1.8788321018218994, "logps/chosen": -153.48651123046875, "logps/rejected": -56.51433563232422, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 7.1574249267578125, "rewards/margins": 4.494952201843262, "rewards/rejected": 2.6624724864959717, "step": 6146 }, { "epoch": 1.36, "learning_rate": 2.4499811899250194e-06, "logits/chosen": -1.8219572305679321, "logits/rejected": -1.8822641372680664, "logps/chosen": -55.759300231933594, "logps/rejected": -74.85594177246094, "loss": 2.1351, "rewards/accuracies": 0.0, "rewards/chosen": 4.78345251083374, "rewards/margins": -4.228515148162842, "rewards/rejected": 9.011967658996582, "step": 6147 }, { "epoch": 1.36, "learning_rate": 2.4484396430615086e-06, "logits/chosen": -1.8186074495315552, "logits/rejected": -1.7963465452194214, "logps/chosen": -65.73226165771484, "logps/rejected": -53.58953857421875, "loss": 0.2041, "rewards/accuracies": 1.0, "rewards/chosen": 3.9944190979003906, "rewards/margins": 1.2403388023376465, "rewards/rejected": 2.754080295562744, "step": 6148 }, { "epoch": 1.36, "learning_rate": 2.4468984240673718e-06, "logits/chosen": -1.711830496788025, "logits/rejected": -1.5242079496383667, "logps/chosen": -47.2205924987793, "logps/rejected": -60.414939880371094, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 2.777193069458008, "rewards/margins": 2.6909937858581543, "rewards/rejected": 0.08619918674230576, "step": 6149 }, { "epoch": 1.36, "learning_rate": 2.445357533140646e-06, "logits/chosen": -2.0923378467559814, "logits/rejected": -2.1023097038269043, "logps/chosen": -49.68730163574219, "logps/rejected": -39.843505859375, "loss": 1.0502, "rewards/accuracies": 1.0, "rewards/chosen": 2.993975877761841, "rewards/margins": 0.031716108322143555, "rewards/rejected": 2.9622597694396973, "step": 6150 }, { "epoch": 1.36, "learning_rate": 2.4438169704793395e-06, "logits/chosen": -1.790927529335022, "logits/rejected": -1.811099886894226, "logps/chosen": -40.16324234008789, "logps/rejected": -89.68685150146484, "loss": 0.2499, "rewards/accuracies": 1.0, "rewards/chosen": 3.860081195831299, "rewards/margins": 0.45943498611450195, "rewards/rejected": 3.400646209716797, "step": 6151 }, { "epoch": 1.36, "learning_rate": 2.4422767362814045e-06, "logits/chosen": -1.9072641134262085, "logits/rejected": -1.9773436784744263, "logps/chosen": -67.45477294921875, "logps/rejected": -96.12867736816406, "loss": 0.5966, "rewards/accuracies": 0.0, "rewards/chosen": 7.228114604949951, "rewards/margins": -0.7122588157653809, "rewards/rejected": 7.940373420715332, "step": 6152 }, { "epoch": 1.36, "learning_rate": 2.4407368307447636e-06, "logits/chosen": -2.0215189456939697, "logits/rejected": -2.0215189456939697, "logps/chosen": -51.94029235839844, "logps/rejected": -51.94029235839844, "loss": 0.3733, "rewards/accuracies": 0.0, "rewards/chosen": 3.3601396083831787, "rewards/margins": 0.0, "rewards/rejected": 3.3601396083831787, "step": 6153 }, { "epoch": 1.36, "learning_rate": 2.439197254067286e-06, "logits/chosen": -2.1851518154144287, "logits/rejected": -2.1603856086730957, "logps/chosen": -63.929664611816406, "logps/rejected": -148.14508056640625, "loss": 0.249, "rewards/accuracies": 1.0, "rewards/chosen": 9.429952621459961, "rewards/margins": 0.5266990661621094, "rewards/rejected": 8.903253555297852, "step": 6154 }, { "epoch": 1.36, "learning_rate": 2.4376580064468047e-06, "logits/chosen": -1.7483549118041992, "logits/rejected": -1.7251394987106323, "logps/chosen": -26.666749954223633, "logps/rejected": -61.19410705566406, "loss": 1.154, "rewards/accuracies": 0.0, "rewards/chosen": 4.211373805999756, "rewards/margins": -2.0591564178466797, "rewards/rejected": 6.2705302238464355, "step": 6155 }, { "epoch": 1.36, "learning_rate": 2.4361190880811102e-06, "logits/chosen": -1.7339134216308594, "logits/rejected": -1.7339134216308594, "logps/chosen": -34.97933578491211, "logps/rejected": -34.97933578491211, "loss": 0.499, "rewards/accuracies": 0.0, "rewards/chosen": 2.1267807483673096, "rewards/margins": 0.0, "rewards/rejected": 2.1267807483673096, "step": 6156 }, { "epoch": 1.36, "learning_rate": 2.4345804991679493e-06, "logits/chosen": -2.1015655994415283, "logits/rejected": -2.052826404571533, "logps/chosen": -196.3782958984375, "logps/rejected": -90.86964416503906, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 8.767242431640625, "rewards/margins": 4.529758930206299, "rewards/rejected": 4.237483501434326, "step": 6157 }, { "epoch": 1.36, "learning_rate": 2.4330422399050296e-06, "logits/chosen": -2.049619674682617, "logits/rejected": -1.9376999139785767, "logps/chosen": -50.56103515625, "logps/rejected": -9.555334091186523, "loss": 0.2635, "rewards/accuracies": 1.0, "rewards/chosen": 3.6187188625335693, "rewards/margins": 3.026186227798462, "rewards/rejected": 0.5925325751304626, "step": 6158 }, { "epoch": 1.36, "learning_rate": 2.431504310490007e-06, "logits/chosen": -2.173292398452759, "logits/rejected": -2.187617301940918, "logps/chosen": -68.8870849609375, "logps/rejected": -66.14056396484375, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 4.680062770843506, "rewards/margins": 0.30741405487060547, "rewards/rejected": 4.3726487159729, "step": 6159 }, { "epoch": 1.36, "learning_rate": 2.42996671112051e-06, "logits/chosen": -1.897668480873108, "logits/rejected": -1.8138153553009033, "logps/chosen": -76.16159057617188, "logps/rejected": -36.294944763183594, "loss": 0.0735, "rewards/accuracies": 1.0, "rewards/chosen": 5.364293098449707, "rewards/margins": 3.0355515480041504, "rewards/rejected": 2.3287415504455566, "step": 6160 }, { "epoch": 1.36, "learning_rate": 2.4284294419941087e-06, "logits/chosen": -2.0149362087249756, "logits/rejected": -2.0149362087249756, "logps/chosen": -52.1630744934082, "logps/rejected": -52.1630744934082, "loss": 0.402, "rewards/accuracies": 0.0, "rewards/chosen": 6.82183837890625, "rewards/margins": 0.0, "rewards/rejected": 6.82183837890625, "step": 6161 }, { "epoch": 1.36, "learning_rate": 2.426892503308346e-06, "logits/chosen": -1.6610825061798096, "logits/rejected": -1.5843786001205444, "logps/chosen": -64.47004699707031, "logps/rejected": -61.63671112060547, "loss": 0.7095, "rewards/accuracies": 0.0, "rewards/chosen": 3.8284804821014404, "rewards/margins": -0.8783853054046631, "rewards/rejected": 4.7068657875061035, "step": 6162 }, { "epoch": 1.36, "learning_rate": 2.4253558952607086e-06, "logits/chosen": -1.7730140686035156, "logits/rejected": -1.7049400806427002, "logps/chosen": -45.72137451171875, "logps/rejected": -9.8176851272583, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 4.778420925140381, "rewards/margins": 4.1433281898498535, "rewards/rejected": 0.6350928544998169, "step": 6163 }, { "epoch": 1.36, "learning_rate": 2.42381961804865e-06, "logits/chosen": -1.8833742141723633, "logits/rejected": -1.8833742141723633, "logps/chosen": -36.04310607910156, "logps/rejected": -36.04310607910156, "loss": 0.3531, "rewards/accuracies": 0.0, "rewards/chosen": 1.6069717407226562, "rewards/margins": 0.0, "rewards/rejected": 1.6069717407226562, "step": 6164 }, { "epoch": 1.36, "learning_rate": 2.422283671869577e-06, "logits/chosen": -2.159825086593628, "logits/rejected": -2.0898919105529785, "logps/chosen": -72.72933959960938, "logps/rejected": -35.748294830322266, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": 2.610799551010132, "rewards/margins": 2.21397066116333, "rewards/rejected": 0.39682886004447937, "step": 6165 }, { "epoch": 1.36, "learning_rate": 2.420748056920856e-06, "logits/chosen": -1.7425990104675293, "logits/rejected": -1.7724361419677734, "logps/chosen": -27.934345245361328, "logps/rejected": -54.92076873779297, "loss": 0.9047, "rewards/accuracies": 0.0, "rewards/chosen": 4.054123401641846, "rewards/margins": -0.2329702377319336, "rewards/rejected": 4.287093639373779, "step": 6166 }, { "epoch": 1.36, "learning_rate": 2.419212773399811e-06, "logits/chosen": -2.0444138050079346, "logits/rejected": -2.017430543899536, "logps/chosen": -72.79428100585938, "logps/rejected": -125.48675537109375, "loss": 0.1284, "rewards/accuracies": 1.0, "rewards/chosen": 8.55799388885498, "rewards/margins": 1.3804426193237305, "rewards/rejected": 7.17755126953125, "step": 6167 }, { "epoch": 1.37, "learning_rate": 2.4176778215037166e-06, "logits/chosen": -1.9408286809921265, "logits/rejected": -1.9118568897247314, "logps/chosen": -82.47409057617188, "logps/rejected": -61.48000717163086, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 6.319161891937256, "rewards/margins": 3.1425960063934326, "rewards/rejected": 3.1765658855438232, "step": 6168 }, { "epoch": 1.37, "learning_rate": 2.4161432014298188e-06, "logits/chosen": -1.988675832748413, "logits/rejected": -1.7200206518173218, "logps/chosen": -55.32182312011719, "logps/rejected": -113.22258758544922, "loss": 0.5472, "rewards/accuracies": 0.0, "rewards/chosen": 5.116481304168701, "rewards/margins": -0.6007499694824219, "rewards/rejected": 5.717231273651123, "step": 6169 }, { "epoch": 1.37, "learning_rate": 2.4146089133753035e-06, "logits/chosen": -1.8172303438186646, "logits/rejected": -1.8239210844039917, "logps/chosen": -128.6600799560547, "logps/rejected": -40.33833312988281, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 5.2450456619262695, "rewards/margins": 3.9105300903320312, "rewards/rejected": 1.3345154523849487, "step": 6170 }, { "epoch": 1.37, "learning_rate": 2.4130749575373326e-06, "logits/chosen": -1.9499244689941406, "logits/rejected": -1.9316456317901611, "logps/chosen": -128.35238647460938, "logps/rejected": -69.13240814208984, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": 7.788710117340088, "rewards/margins": 3.3391332626342773, "rewards/rejected": 4.4495768547058105, "step": 6171 }, { "epoch": 1.37, "learning_rate": 2.4115413341130085e-06, "logits/chosen": -2.1193225383758545, "logits/rejected": -2.1321029663085938, "logps/chosen": -38.832637786865234, "logps/rejected": -60.16986846923828, "loss": 0.9956, "rewards/accuracies": 0.0, "rewards/chosen": 3.8753583431243896, "rewards/margins": -1.6290605068206787, "rewards/rejected": 5.504418849945068, "step": 6172 }, { "epoch": 1.37, "learning_rate": 2.4100080432994e-06, "logits/chosen": -1.565425992012024, "logits/rejected": -1.6418712139129639, "logps/chosen": -21.969364166259766, "logps/rejected": -62.721351623535156, "loss": 1.2841, "rewards/accuracies": 0.0, "rewards/chosen": 0.7324215173721313, "rewards/margins": -2.2121191024780273, "rewards/rejected": 2.944540500640869, "step": 6173 }, { "epoch": 1.37, "learning_rate": 2.4084750852935325e-06, "logits/chosen": -1.7824023962020874, "logits/rejected": -1.6958109140396118, "logps/chosen": -102.78108978271484, "logps/rejected": -79.26736450195312, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": 6.272346019744873, "rewards/margins": 1.91510009765625, "rewards/rejected": 4.357245922088623, "step": 6174 }, { "epoch": 1.37, "learning_rate": 2.4069424602923865e-06, "logits/chosen": -1.7262675762176514, "logits/rejected": -1.7060065269470215, "logps/chosen": -94.96060943603516, "logps/rejected": -82.76152038574219, "loss": 0.1043, "rewards/accuracies": 1.0, "rewards/chosen": 7.871132850646973, "rewards/margins": 1.8573999404907227, "rewards/rejected": 6.01373291015625, "step": 6175 }, { "epoch": 1.37, "learning_rate": 2.405410168492902e-06, "logits/chosen": -1.9102933406829834, "logits/rejected": -1.9102933406829834, "logps/chosen": -45.31231689453125, "logps/rejected": -45.31231689453125, "loss": 0.3611, "rewards/accuracies": 0.0, "rewards/chosen": 5.438407897949219, "rewards/margins": 0.0, "rewards/rejected": 5.438407897949219, "step": 6176 }, { "epoch": 1.37, "learning_rate": 2.4038782100919703e-06, "logits/chosen": -1.695476770401001, "logits/rejected": -1.6010586023330688, "logps/chosen": -99.10612487792969, "logps/rejected": -67.84169006347656, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 6.209141731262207, "rewards/margins": 3.924920082092285, "rewards/rejected": 2.284221649169922, "step": 6177 }, { "epoch": 1.37, "learning_rate": 2.402346585286452e-06, "logits/chosen": -2.0146172046661377, "logits/rejected": -1.964394450187683, "logps/chosen": -132.77500915527344, "logps/rejected": -75.0822525024414, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 6.335791110992432, "rewards/margins": 3.4928925037384033, "rewards/rejected": 2.8428986072540283, "step": 6178 }, { "epoch": 1.37, "learning_rate": 2.400815294273149e-06, "logits/chosen": -2.110619306564331, "logits/rejected": -2.0481090545654297, "logps/chosen": -146.03514099121094, "logps/rejected": -33.64949417114258, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 6.783891201019287, "rewards/margins": 6.273156642913818, "rewards/rejected": 0.5107345581054688, "step": 6179 }, { "epoch": 1.37, "learning_rate": 2.3992843372488357e-06, "logits/chosen": -1.9215701818466187, "logits/rejected": -1.8613255023956299, "logps/chosen": -72.7161865234375, "logps/rejected": -54.933815002441406, "loss": 0.169, "rewards/accuracies": 1.0, "rewards/chosen": 4.0995330810546875, "rewards/margins": 0.929070234298706, "rewards/rejected": 3.1704628467559814, "step": 6180 }, { "epoch": 1.37, "learning_rate": 2.3977537144102314e-06, "logits/chosen": -2.0293073654174805, "logits/rejected": -2.0019397735595703, "logps/chosen": -40.82551574707031, "logps/rejected": -51.304439544677734, "loss": 1.7454, "rewards/accuracies": 0.0, "rewards/chosen": 3.8566513061523438, "rewards/margins": -0.21956205368041992, "rewards/rejected": 4.076213359832764, "step": 6181 }, { "epoch": 1.37, "learning_rate": 2.396223425954019e-06, "logits/chosen": -1.7896385192871094, "logits/rejected": -1.795235514640808, "logps/chosen": -17.384143829345703, "logps/rejected": -33.317771911621094, "loss": 0.6418, "rewards/accuracies": 0.0, "rewards/chosen": 3.129337787628174, "rewards/margins": -0.011394023895263672, "rewards/rejected": 3.1407318115234375, "step": 6182 }, { "epoch": 1.37, "learning_rate": 2.394693472076837e-06, "logits/chosen": -1.6872551441192627, "logits/rejected": -1.7519229650497437, "logps/chosen": -37.440452575683594, "logps/rejected": -115.64683532714844, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 5.0450286865234375, "rewards/margins": 0.8872056007385254, "rewards/rejected": 4.157823085784912, "step": 6183 }, { "epoch": 1.37, "learning_rate": 2.3931638529752813e-06, "logits/chosen": -2.1165988445281982, "logits/rejected": -1.8923907279968262, "logps/chosen": -101.64161682128906, "logps/rejected": -113.98600769042969, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 8.667311668395996, "rewards/margins": 3.4195971488952637, "rewards/rejected": 5.247714519500732, "step": 6184 }, { "epoch": 1.37, "learning_rate": 2.3916345688459057e-06, "logits/chosen": -1.8527686595916748, "logits/rejected": -1.7940179109573364, "logps/chosen": -44.25286102294922, "logps/rejected": -32.31867218017578, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": 5.4446234703063965, "rewards/margins": 1.032179832458496, "rewards/rejected": 4.4124436378479, "step": 6185 }, { "epoch": 1.37, "learning_rate": 2.390105619885214e-06, "logits/chosen": -1.8385157585144043, "logits/rejected": -1.3002976179122925, "logps/chosen": -96.0647964477539, "logps/rejected": -74.78956604003906, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": 8.619898796081543, "rewards/margins": 2.2163052558898926, "rewards/rejected": 6.40359354019165, "step": 6186 }, { "epoch": 1.37, "learning_rate": 2.3885770062896795e-06, "logits/chosen": -2.2223074436187744, "logits/rejected": -1.5943866968154907, "logps/chosen": -149.19522094726562, "logps/rejected": -49.085289001464844, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 7.203985691070557, "rewards/margins": 3.9385552406311035, "rewards/rejected": 3.265430450439453, "step": 6187 }, { "epoch": 1.37, "learning_rate": 2.387048728255718e-06, "logits/chosen": -1.9897303581237793, "logits/rejected": -1.9139515161514282, "logps/chosen": -77.66181945800781, "logps/rejected": -36.59962463378906, "loss": 1.1138, "rewards/accuracies": 1.0, "rewards/chosen": 3.6069374084472656, "rewards/margins": 0.8260383605957031, "rewards/rejected": 2.7808990478515625, "step": 6188 }, { "epoch": 1.37, "learning_rate": 2.385520785979718e-06, "logits/chosen": -1.7298774719238281, "logits/rejected": -1.6643962860107422, "logps/chosen": -140.17013549804688, "logps/rejected": -87.41526794433594, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 6.280513286590576, "rewards/margins": 2.4141223430633545, "rewards/rejected": 3.8663909435272217, "step": 6189 }, { "epoch": 1.37, "learning_rate": 2.3839931796580094e-06, "logits/chosen": -2.0421605110168457, "logits/rejected": -1.954513669013977, "logps/chosen": -70.55756378173828, "logps/rejected": -10.52575397491455, "loss": 0.2467, "rewards/accuracies": 1.0, "rewards/chosen": 3.5451576709747314, "rewards/margins": 0.6102268695831299, "rewards/rejected": 2.9349308013916016, "step": 6190 }, { "epoch": 1.37, "learning_rate": 2.3824659094868897e-06, "logits/chosen": -1.609688639640808, "logits/rejected": -1.6330233812332153, "logps/chosen": -60.83198547363281, "logps/rejected": -110.15718841552734, "loss": 0.2586, "rewards/accuracies": 1.0, "rewards/chosen": 8.414095878601074, "rewards/margins": 0.8155736923217773, "rewards/rejected": 7.598522186279297, "step": 6191 }, { "epoch": 1.37, "learning_rate": 2.380938975662607e-06, "logits/chosen": -1.9894262552261353, "logits/rejected": -1.940919280052185, "logps/chosen": -40.05612564086914, "logps/rejected": -44.56171798706055, "loss": 0.1409, "rewards/accuracies": 1.0, "rewards/chosen": 3.307708501815796, "rewards/margins": 1.3493279218673706, "rewards/rejected": 1.9583805799484253, "step": 6192 }, { "epoch": 1.37, "learning_rate": 2.3794123783813705e-06, "logits/chosen": -1.9531630277633667, "logits/rejected": -2.038128614425659, "logps/chosen": -30.498315811157227, "logps/rejected": -149.27552795410156, "loss": 2.7922, "rewards/accuracies": 0.0, "rewards/chosen": 4.5124688148498535, "rewards/margins": -5.3106207847595215, "rewards/rejected": 9.823089599609375, "step": 6193 }, { "epoch": 1.37, "learning_rate": 2.3778861178393453e-06, "logits/chosen": -1.8027479648590088, "logits/rejected": -1.678371787071228, "logps/chosen": -39.93213653564453, "logps/rejected": -11.51660442352295, "loss": 0.3899, "rewards/accuracies": 1.0, "rewards/chosen": 3.018923282623291, "rewards/margins": 2.401035785675049, "rewards/rejected": 0.6178876161575317, "step": 6194 }, { "epoch": 1.37, "learning_rate": 2.376360194232647e-06, "logits/chosen": -2.198049545288086, "logits/rejected": -2.162330389022827, "logps/chosen": -54.20819091796875, "logps/rejected": -22.345321655273438, "loss": 0.1991, "rewards/accuracies": 1.0, "rewards/chosen": 2.7407166957855225, "rewards/margins": 0.9442566633224487, "rewards/rejected": 1.7964600324630737, "step": 6195 }, { "epoch": 1.37, "learning_rate": 2.3748346077573605e-06, "logits/chosen": -1.92537260055542, "logits/rejected": -1.8787144422531128, "logps/chosen": -51.55678176879883, "logps/rejected": -31.957035064697266, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": 3.3375279903411865, "rewards/margins": 1.0161621570587158, "rewards/rejected": 2.3213658332824707, "step": 6196 }, { "epoch": 1.37, "learning_rate": 2.373309358609512e-06, "logits/chosen": -1.826090931892395, "logits/rejected": -1.711949348449707, "logps/chosen": -113.8713150024414, "logps/rejected": -37.61316680908203, "loss": 0.0799, "rewards/accuracies": 1.0, "rewards/chosen": 6.007283687591553, "rewards/margins": 2.912651777267456, "rewards/rejected": 3.0946319103240967, "step": 6197 }, { "epoch": 1.37, "learning_rate": 2.3717844469851e-06, "logits/chosen": -2.0964434146881104, "logits/rejected": -1.634207010269165, "logps/chosen": -54.443267822265625, "logps/rejected": -89.92088317871094, "loss": 0.4553, "rewards/accuracies": 0.0, "rewards/chosen": 6.4423394203186035, "rewards/margins": -0.39467525482177734, "rewards/rejected": 6.837014675140381, "step": 6198 }, { "epoch": 1.37, "learning_rate": 2.3702598730800675e-06, "logits/chosen": -1.9501595497131348, "logits/rejected": -1.8376818895339966, "logps/chosen": -93.27499389648438, "logps/rejected": -162.1531219482422, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 7.81666898727417, "rewards/margins": 1.4784502983093262, "rewards/rejected": 6.338218688964844, "step": 6199 }, { "epoch": 1.37, "learning_rate": 2.368735637090319e-06, "logits/chosen": -2.1274795532226562, "logits/rejected": -2.1225953102111816, "logps/chosen": -96.11936950683594, "logps/rejected": -183.88255310058594, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 12.691368103027344, "rewards/margins": 3.4822845458984375, "rewards/rejected": 9.209083557128906, "step": 6200 }, { "epoch": 1.37, "learning_rate": 2.367211739211716e-06, "logits/chosen": -2.091512441635132, "logits/rejected": -1.919443964958191, "logps/chosen": -162.4462432861328, "logps/rejected": -30.854005813598633, "loss": 0.348, "rewards/accuracies": 1.0, "rewards/chosen": 6.591658115386963, "rewards/margins": 5.839977741241455, "rewards/rejected": 0.7516801953315735, "step": 6201 }, { "epoch": 1.37, "learning_rate": 2.365688179640076e-06, "logits/chosen": -1.6911580562591553, "logits/rejected": -1.6911580562591553, "logps/chosen": -60.01081085205078, "logps/rejected": -60.01081085205078, "loss": 0.4547, "rewards/accuracies": 0.0, "rewards/chosen": 2.8465561866760254, "rewards/margins": 0.0, "rewards/rejected": 2.8465561866760254, "step": 6202 }, { "epoch": 1.37, "learning_rate": 2.3641649585711733e-06, "logits/chosen": -1.988393783569336, "logits/rejected": -1.9895963668823242, "logps/chosen": -72.24246215820312, "logps/rejected": -65.67066192626953, "loss": 1.0906, "rewards/accuracies": 1.0, "rewards/chosen": 5.26260232925415, "rewards/margins": 1.2964119911193848, "rewards/rejected": 3.9661903381347656, "step": 6203 }, { "epoch": 1.37, "learning_rate": 2.3626420762007335e-06, "logits/chosen": -2.071819543838501, "logits/rejected": -2.085397958755493, "logps/chosen": -15.895768165588379, "logps/rejected": -128.41461181640625, "loss": 4.0258, "rewards/accuracies": 0.0, "rewards/chosen": 3.4598705768585205, "rewards/margins": -3.5182836055755615, "rewards/rejected": 6.978154182434082, "step": 6204 }, { "epoch": 1.37, "learning_rate": 2.3611195327244506e-06, "logits/chosen": -1.7713091373443604, "logits/rejected": -1.7060285806655884, "logps/chosen": -90.53474426269531, "logps/rejected": -129.0057830810547, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": 7.197232246398926, "rewards/margins": 0.40974903106689453, "rewards/rejected": 6.787483215332031, "step": 6205 }, { "epoch": 1.37, "learning_rate": 2.35959732833796e-06, "logits/chosen": -1.9240705966949463, "logits/rejected": -1.8592686653137207, "logps/chosen": -94.76182556152344, "logps/rejected": -75.89351654052734, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 8.23785400390625, "rewards/margins": 3.3355917930603027, "rewards/rejected": 4.902262210845947, "step": 6206 }, { "epoch": 1.37, "learning_rate": 2.35807546323687e-06, "logits/chosen": -1.9940942525863647, "logits/rejected": -1.7877475023269653, "logps/chosen": -148.6962890625, "logps/rejected": -55.88114929199219, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 7.385341167449951, "rewards/margins": 5.562387466430664, "rewards/rejected": 1.8229538202285767, "step": 6207 }, { "epoch": 1.37, "learning_rate": 2.3565539376167295e-06, "logits/chosen": -1.888006567955017, "logits/rejected": -1.8646920919418335, "logps/chosen": -46.73058319091797, "logps/rejected": -33.194881439208984, "loss": 0.4046, "rewards/accuracies": 0.0, "rewards/chosen": 4.276966094970703, "rewards/margins": -0.10499238967895508, "rewards/rejected": 4.381958484649658, "step": 6208 }, { "epoch": 1.37, "learning_rate": 2.355032751673054e-06, "logits/chosen": -2.102769613265991, "logits/rejected": -2.0697879791259766, "logps/chosen": -44.85788345336914, "logps/rejected": -51.9771728515625, "loss": 0.2123, "rewards/accuracies": 1.0, "rewards/chosen": 3.9520986080169678, "rewards/margins": 1.1264591217041016, "rewards/rejected": 2.825639486312866, "step": 6209 }, { "epoch": 1.37, "learning_rate": 2.3535119056013107e-06, "logits/chosen": -1.7985179424285889, "logits/rejected": -1.7952204942703247, "logps/chosen": -56.744468688964844, "logps/rejected": -75.10120391845703, "loss": 0.1939, "rewards/accuracies": 1.0, "rewards/chosen": 3.5706779956817627, "rewards/margins": 0.9094099998474121, "rewards/rejected": 2.6612679958343506, "step": 6210 }, { "epoch": 1.37, "learning_rate": 2.3519913995969256e-06, "logits/chosen": -2.179389238357544, "logits/rejected": -1.9667961597442627, "logps/chosen": -83.79872131347656, "logps/rejected": -25.488569259643555, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": 9.34862232208252, "rewards/margins": 8.720491409301758, "rewards/rejected": 0.628131091594696, "step": 6211 }, { "epoch": 1.37, "learning_rate": 2.3504712338552804e-06, "logits/chosen": -2.016541004180908, "logits/rejected": -1.7852964401245117, "logps/chosen": -34.4642219543457, "logps/rejected": -203.1396484375, "loss": 85.5697, "rewards/accuracies": 0.0, "rewards/chosen": 6.366012096405029, "rewards/margins": -167.9757843017578, "rewards/rejected": 174.341796875, "step": 6212 }, { "epoch": 1.38, "learning_rate": 2.348951408571712e-06, "logits/chosen": -1.7387582063674927, "logits/rejected": -1.7208091020584106, "logps/chosen": -36.20886993408203, "logps/rejected": -99.87995910644531, "loss": 0.0998, "rewards/accuracies": 1.0, "rewards/chosen": 3.637026309967041, "rewards/margins": 3.1444344520568848, "rewards/rejected": 0.49259185791015625, "step": 6213 }, { "epoch": 1.38, "learning_rate": 2.3474319239415165e-06, "logits/chosen": -2.0453274250030518, "logits/rejected": -2.001492500305176, "logps/chosen": -109.97216796875, "logps/rejected": -186.62477111816406, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 11.559592247009277, "rewards/margins": 2.2801132202148438, "rewards/rejected": 9.279479026794434, "step": 6214 }, { "epoch": 1.38, "learning_rate": 2.345912780159939e-06, "logits/chosen": -1.6039137840270996, "logits/rejected": -1.540449857711792, "logps/chosen": -49.64045715332031, "logps/rejected": -44.83565902709961, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": 3.9338746070861816, "rewards/margins": 1.1424694061279297, "rewards/rejected": 2.791405200958252, "step": 6215 }, { "epoch": 1.38, "learning_rate": 2.3443939774221926e-06, "logits/chosen": -2.0125112533569336, "logits/rejected": -1.9985569715499878, "logps/chosen": -70.11219787597656, "logps/rejected": -65.95022583007812, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 4.3426833152771, "rewards/margins": 1.6403167247772217, "rewards/rejected": 2.702366590499878, "step": 6216 }, { "epoch": 1.38, "learning_rate": 2.342875515923434e-06, "logits/chosen": -2.176168918609619, "logits/rejected": -2.1714556217193604, "logps/chosen": -49.779396057128906, "logps/rejected": -54.0651741027832, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 4.337314605712891, "rewards/margins": 2.8440799713134766, "rewards/rejected": 1.493234634399414, "step": 6217 }, { "epoch": 1.38, "learning_rate": 2.341357395858784e-06, "logits/chosen": -2.2829906940460205, "logits/rejected": -2.3081166744232178, "logps/chosen": -71.82069396972656, "logps/rejected": -90.74687194824219, "loss": 0.6863, "rewards/accuracies": 0.0, "rewards/chosen": 8.560416221618652, "rewards/margins": -1.012803077697754, "rewards/rejected": 9.573219299316406, "step": 6218 }, { "epoch": 1.38, "learning_rate": 2.339839617423318e-06, "logits/chosen": -1.9501785039901733, "logits/rejected": -1.8705658912658691, "logps/chosen": -101.45352935791016, "logps/rejected": -43.72626495361328, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": 4.30034875869751, "rewards/margins": 2.0567612648010254, "rewards/rejected": 2.2435874938964844, "step": 6219 }, { "epoch": 1.38, "learning_rate": 2.3383221808120654e-06, "logits/chosen": -1.996069312095642, "logits/rejected": -1.9375410079956055, "logps/chosen": -70.74510955810547, "logps/rejected": -69.29721069335938, "loss": 1.3173, "rewards/accuracies": 1.0, "rewards/chosen": 7.3285746574401855, "rewards/margins": 4.242037773132324, "rewards/rejected": 3.0865371227264404, "step": 6220 }, { "epoch": 1.38, "learning_rate": 2.336805086220015e-06, "logits/chosen": -2.2221274375915527, "logits/rejected": -2.233123779296875, "logps/chosen": -31.84888458251953, "logps/rejected": -29.94916534423828, "loss": 0.7093, "rewards/accuracies": 0.0, "rewards/chosen": 3.931394338607788, "rewards/margins": -1.0870888233184814, "rewards/rejected": 5.0184831619262695, "step": 6221 }, { "epoch": 1.38, "learning_rate": 2.3352883338421085e-06, "logits/chosen": -2.3089401721954346, "logits/rejected": -2.2411386966705322, "logps/chosen": -152.93753051757812, "logps/rejected": -25.918527603149414, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 6.714613437652588, "rewards/margins": 4.631283760070801, "rewards/rejected": 2.083329916000366, "step": 6222 }, { "epoch": 1.38, "learning_rate": 2.3337719238732456e-06, "logits/chosen": -2.0135960578918457, "logits/rejected": -1.885528326034546, "logps/chosen": -128.820068359375, "logps/rejected": -65.39193725585938, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 7.719714641571045, "rewards/margins": 5.20143461227417, "rewards/rejected": 2.518280029296875, "step": 6223 }, { "epoch": 1.38, "learning_rate": 2.332255856508282e-06, "logits/chosen": -2.1686348915100098, "logits/rejected": -2.1686348915100098, "logps/chosen": -42.60481262207031, "logps/rejected": -42.60481262207031, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 7.202278137207031, "rewards/margins": 0.0, "rewards/rejected": 7.202278137207031, "step": 6224 }, { "epoch": 1.38, "learning_rate": 2.330740131942027e-06, "logits/chosen": -1.7761669158935547, "logits/rejected": -1.7463300228118896, "logps/chosen": -32.098052978515625, "logps/rejected": -63.32133102416992, "loss": 0.1865, "rewards/accuracies": 1.0, "rewards/chosen": 4.3226318359375, "rewards/margins": 0.9526331424713135, "rewards/rejected": 3.3699986934661865, "step": 6225 }, { "epoch": 1.38, "learning_rate": 2.3292247503692478e-06, "logits/chosen": -2.0364770889282227, "logits/rejected": -1.9963428974151611, "logps/chosen": -39.55066680908203, "logps/rejected": -34.395164489746094, "loss": 0.1633, "rewards/accuracies": 1.0, "rewards/chosen": 3.114501953125, "rewards/margins": 0.9568915367126465, "rewards/rejected": 2.1576104164123535, "step": 6226 }, { "epoch": 1.38, "learning_rate": 2.3277097119846677e-06, "logits/chosen": -1.793334722518921, "logits/rejected": -1.8898941278457642, "logps/chosen": -43.725868225097656, "logps/rejected": -107.19477081298828, "loss": 0.9737, "rewards/accuracies": 0.0, "rewards/chosen": 6.91211462020874, "rewards/margins": -0.2976870536804199, "rewards/rejected": 7.20980167388916, "step": 6227 }, { "epoch": 1.38, "learning_rate": 2.3261950169829656e-06, "logits/chosen": -1.8612680435180664, "logits/rejected": -1.4107505083084106, "logps/chosen": -38.825069427490234, "logps/rejected": -74.91598510742188, "loss": 0.2413, "rewards/accuracies": 1.0, "rewards/chosen": 2.971277952194214, "rewards/margins": 0.5126552581787109, "rewards/rejected": 2.458622694015503, "step": 6228 }, { "epoch": 1.38, "learning_rate": 2.3246806655587762e-06, "logits/chosen": -1.9118887186050415, "logits/rejected": -1.8838722705841064, "logps/chosen": -88.69679260253906, "logps/rejected": -63.941001892089844, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 8.055198669433594, "rewards/margins": 2.1663260459899902, "rewards/rejected": 5.8888726234436035, "step": 6229 }, { "epoch": 1.38, "learning_rate": 2.3231666579066904e-06, "logits/chosen": -1.9706952571868896, "logits/rejected": -2.022993564605713, "logps/chosen": -86.77053833007812, "logps/rejected": -159.4769744873047, "loss": 0.8446, "rewards/accuracies": 0.0, "rewards/chosen": 9.207696914672852, "rewards/margins": -1.3790416717529297, "rewards/rejected": 10.586738586425781, "step": 6230 }, { "epoch": 1.38, "learning_rate": 2.3216529942212536e-06, "logits/chosen": -1.8110911846160889, "logits/rejected": -1.724979281425476, "logps/chosen": -49.04955291748047, "logps/rejected": -10.935709953308105, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.2865662574768066, "rewards/margins": 1.408562183380127, "rewards/rejected": 0.8780040144920349, "step": 6231 }, { "epoch": 1.38, "learning_rate": 2.320139674696969e-06, "logits/chosen": -2.0445032119750977, "logits/rejected": -1.9620367288589478, "logps/chosen": -44.48062515258789, "logps/rejected": -24.34766387939453, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 3.2986667156219482, "rewards/margins": 2.9849390983581543, "rewards/rejected": 0.31372758746147156, "step": 6232 }, { "epoch": 1.38, "learning_rate": 2.3186266995282954e-06, "logits/chosen": -1.8279173374176025, "logits/rejected": -1.8101484775543213, "logps/chosen": -38.806175231933594, "logps/rejected": -65.82083892822266, "loss": 0.2544, "rewards/accuracies": 1.0, "rewards/chosen": 4.577822208404541, "rewards/margins": 0.9620378017425537, "rewards/rejected": 3.6157844066619873, "step": 6233 }, { "epoch": 1.38, "learning_rate": 2.317114068909643e-06, "logits/chosen": -1.8864175081253052, "logits/rejected": -1.8717623949050903, "logps/chosen": -91.8705825805664, "logps/rejected": -101.82029724121094, "loss": 0.1272, "rewards/accuracies": 1.0, "rewards/chosen": 7.206745147705078, "rewards/margins": 1.2970190048217773, "rewards/rejected": 5.909726142883301, "step": 6234 }, { "epoch": 1.38, "learning_rate": 2.3156017830353837e-06, "logits/chosen": -1.6858903169631958, "logits/rejected": -1.6858903169631958, "logps/chosen": -33.675575256347656, "logps/rejected": -33.675575256347656, "loss": 0.3699, "rewards/accuracies": 0.0, "rewards/chosen": 4.318772315979004, "rewards/margins": 0.0, "rewards/rejected": 4.318772315979004, "step": 6235 }, { "epoch": 1.38, "learning_rate": 2.3140898420998425e-06, "logits/chosen": -1.9539060592651367, "logits/rejected": -1.7521954774856567, "logps/chosen": -29.90917205810547, "logps/rejected": -151.1173095703125, "loss": 1.2907, "rewards/accuracies": 0.0, "rewards/chosen": 3.6851189136505127, "rewards/margins": -2.493692636489868, "rewards/rejected": 6.178811550140381, "step": 6236 }, { "epoch": 1.38, "learning_rate": 2.312578246297299e-06, "logits/chosen": -1.7107021808624268, "logits/rejected": -1.6766281127929688, "logps/chosen": -55.53132247924805, "logps/rejected": -39.550697326660156, "loss": 0.8201, "rewards/accuracies": 0.0, "rewards/chosen": 3.2502224445343018, "rewards/margins": -1.3807399272918701, "rewards/rejected": 4.630962371826172, "step": 6237 }, { "epoch": 1.38, "learning_rate": 2.3110669958219913e-06, "logits/chosen": -2.2109556198120117, "logits/rejected": -2.2098660469055176, "logps/chosen": -37.51758575439453, "logps/rejected": -77.69540405273438, "loss": 0.728, "rewards/accuracies": 0.0, "rewards/chosen": 4.068824768066406, "rewards/margins": -0.5868945121765137, "rewards/rejected": 4.65571928024292, "step": 6238 }, { "epoch": 1.38, "learning_rate": 2.30955609086811e-06, "logits/chosen": -1.6019994020462036, "logits/rejected": -1.616356372833252, "logps/chosen": -54.54759216308594, "logps/rejected": -51.79273223876953, "loss": 0.2775, "rewards/accuracies": 1.0, "rewards/chosen": 4.080735683441162, "rewards/margins": 0.3622419834136963, "rewards/rejected": 3.718493700027466, "step": 6239 }, { "epoch": 1.38, "learning_rate": 2.308045531629804e-06, "logits/chosen": -1.9782663583755493, "logits/rejected": -1.9610965251922607, "logps/chosen": -50.51886749267578, "logps/rejected": -70.10284423828125, "loss": 0.4393, "rewards/accuracies": 0.0, "rewards/chosen": 4.137081146240234, "rewards/margins": -0.2812166213989258, "rewards/rejected": 4.41829776763916, "step": 6240 }, { "epoch": 1.38, "learning_rate": 2.306535318301175e-06, "logits/chosen": -1.7737278938293457, "logits/rejected": -1.782405972480774, "logps/chosen": -24.699748992919922, "logps/rejected": -51.50107192993164, "loss": 1.3339, "rewards/accuracies": 0.0, "rewards/chosen": 4.174557209014893, "rewards/margins": -2.555119514465332, "rewards/rejected": 6.729676723480225, "step": 6241 }, { "epoch": 1.38, "learning_rate": 2.3050254510762855e-06, "logits/chosen": -1.6736195087432861, "logits/rejected": -1.2268807888031006, "logps/chosen": -99.8449935913086, "logps/rejected": -110.95413970947266, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 7.201975345611572, "rewards/margins": 3.8264832496643066, "rewards/rejected": 3.3754920959472656, "step": 6242 }, { "epoch": 1.38, "learning_rate": 2.3035159301491426e-06, "logits/chosen": -2.0103912353515625, "logits/rejected": -2.0806515216827393, "logps/chosen": -44.25740051269531, "logps/rejected": -74.17591094970703, "loss": 0.5265, "rewards/accuracies": 0.0, "rewards/chosen": 5.720761299133301, "rewards/margins": -0.5850653648376465, "rewards/rejected": 6.305826663970947, "step": 6243 }, { "epoch": 1.38, "learning_rate": 2.302006755713724e-06, "logits/chosen": -2.0459790229797363, "logits/rejected": -2.030400037765503, "logps/chosen": -55.77763748168945, "logps/rejected": -90.44013977050781, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 5.121245384216309, "rewards/margins": 1.8129734992980957, "rewards/rejected": 3.308271884918213, "step": 6244 }, { "epoch": 1.38, "learning_rate": 2.3004979279639506e-06, "logits/chosen": -1.7813702821731567, "logits/rejected": -1.7464340925216675, "logps/chosen": -24.74312400817871, "logps/rejected": -40.098182678222656, "loss": 0.168, "rewards/accuracies": 1.0, "rewards/chosen": 2.857060432434082, "rewards/margins": 0.9358736276626587, "rewards/rejected": 1.9211868047714233, "step": 6245 }, { "epoch": 1.38, "learning_rate": 2.2989894470937037e-06, "logits/chosen": -1.752054214477539, "logits/rejected": -1.7487345933914185, "logps/chosen": -35.82534408569336, "logps/rejected": -59.45674133300781, "loss": 0.6238, "rewards/accuracies": 0.0, "rewards/chosen": 4.241483211517334, "rewards/margins": -0.5883111953735352, "rewards/rejected": 4.829794406890869, "step": 6246 }, { "epoch": 1.38, "learning_rate": 2.2974813132968193e-06, "logits/chosen": -1.7870172262191772, "logits/rejected": -1.7060784101486206, "logps/chosen": -49.28289031982422, "logps/rejected": -13.597975730895996, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 3.389303684234619, "rewards/margins": 2.3867621421813965, "rewards/rejected": 1.002541422843933, "step": 6247 }, { "epoch": 1.38, "learning_rate": 2.2959735267670896e-06, "logits/chosen": -1.9837902784347534, "logits/rejected": -1.966223955154419, "logps/chosen": -34.38402557373047, "logps/rejected": -51.85884475708008, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 4.19909143447876, "rewards/margins": 1.169569730758667, "rewards/rejected": 3.0295217037200928, "step": 6248 }, { "epoch": 1.38, "learning_rate": 2.2944660876982615e-06, "logits/chosen": -2.3185675144195557, "logits/rejected": -2.3306198120117188, "logps/chosen": -88.75115966796875, "logps/rejected": -90.91822814941406, "loss": 0.2847, "rewards/accuracies": 1.0, "rewards/chosen": 8.980674743652344, "rewards/margins": 0.42321300506591797, "rewards/rejected": 8.557461738586426, "step": 6249 }, { "epoch": 1.38, "learning_rate": 2.2929589962840375e-06, "logits/chosen": -1.8625514507293701, "logits/rejected": -1.8461577892303467, "logps/chosen": -21.918689727783203, "logps/rejected": -19.71306610107422, "loss": 1.4156, "rewards/accuracies": 1.0, "rewards/chosen": 1.1477138996124268, "rewards/margins": 0.18473893404006958, "rewards/rejected": 0.9629749655723572, "step": 6250 }, { "epoch": 1.38, "learning_rate": 2.2914522527180764e-06, "logits/chosen": -2.090390920639038, "logits/rejected": -2.1033365726470947, "logps/chosen": -55.7359619140625, "logps/rejected": -78.8233642578125, "loss": 1.337, "rewards/accuracies": 0.0, "rewards/chosen": 3.5226738452911377, "rewards/margins": -2.2617547512054443, "rewards/rejected": 5.784428596496582, "step": 6251 }, { "epoch": 1.38, "learning_rate": 2.2899458571939863e-06, "logits/chosen": -1.843339204788208, "logits/rejected": -1.8094000816345215, "logps/chosen": -66.22116088867188, "logps/rejected": -50.6616096496582, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": 6.766226291656494, "rewards/margins": 4.414252281188965, "rewards/rejected": 2.3519742488861084, "step": 6252 }, { "epoch": 1.38, "learning_rate": 2.288439809905342e-06, "logits/chosen": -1.8755227327346802, "logits/rejected": -1.9198545217514038, "logps/chosen": -28.739824295043945, "logps/rejected": -90.90321350097656, "loss": 2.4797, "rewards/accuracies": 0.0, "rewards/chosen": 4.921061992645264, "rewards/margins": -4.901081562042236, "rewards/rejected": 9.8221435546875, "step": 6253 }, { "epoch": 1.38, "learning_rate": 2.286934111045663e-06, "logits/chosen": -1.8756864070892334, "logits/rejected": -1.869529128074646, "logps/chosen": -40.15435028076172, "logps/rejected": -19.578777313232422, "loss": 1.6753, "rewards/accuracies": 1.0, "rewards/chosen": 5.869783878326416, "rewards/margins": 3.9797849655151367, "rewards/rejected": 1.8899990320205688, "step": 6254 }, { "epoch": 1.38, "learning_rate": 2.2854287608084276e-06, "logits/chosen": -2.2545576095581055, "logits/rejected": -2.207463502883911, "logps/chosen": -55.0200080871582, "logps/rejected": -77.35846710205078, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 7.053366184234619, "rewards/margins": 6.554836273193359, "rewards/rejected": 0.4985298216342926, "step": 6255 }, { "epoch": 1.38, "learning_rate": 2.283923759387071e-06, "logits/chosen": -1.9798576831817627, "logits/rejected": -1.9135220050811768, "logps/chosen": -30.475862503051758, "logps/rejected": -39.86468505859375, "loss": 0.3897, "rewards/accuracies": 0.0, "rewards/chosen": 1.8683096170425415, "rewards/margins": -0.14945614337921143, "rewards/rejected": 2.017765760421753, "step": 6256 }, { "epoch": 1.38, "learning_rate": 2.2824191069749824e-06, "logits/chosen": -1.8429371118545532, "logits/rejected": -1.754776120185852, "logps/chosen": -59.16358947753906, "logps/rejected": -8.811239242553711, "loss": 0.3187, "rewards/accuracies": 1.0, "rewards/chosen": 1.4502228498458862, "rewards/margins": 0.46735864877700806, "rewards/rejected": 0.9828642010688782, "step": 6257 }, { "epoch": 1.39, "learning_rate": 2.280914803765505e-06, "logits/chosen": -1.8538055419921875, "logits/rejected": -1.6728174686431885, "logps/chosen": -117.2425308227539, "logps/rejected": -50.93781280517578, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 8.001781463623047, "rewards/margins": 5.241856575012207, "rewards/rejected": 2.759925127029419, "step": 6258 }, { "epoch": 1.39, "learning_rate": 2.2794108499519393e-06, "logits/chosen": -1.9148107767105103, "logits/rejected": -1.9126173257827759, "logps/chosen": -43.19377899169922, "logps/rejected": -48.590538024902344, "loss": 0.3446, "rewards/accuracies": 1.0, "rewards/chosen": 4.654140472412109, "rewards/margins": 0.7085907459259033, "rewards/rejected": 3.945549726486206, "step": 6259 }, { "epoch": 1.39, "learning_rate": 2.277907245727541e-06, "logits/chosen": -1.9249792098999023, "logits/rejected": -2.0077154636383057, "logps/chosen": -69.5970458984375, "logps/rejected": -107.97008514404297, "loss": 0.541, "rewards/accuracies": 0.0, "rewards/chosen": 5.918386936187744, "rewards/margins": -0.6592569351196289, "rewards/rejected": 6.577643871307373, "step": 6260 }, { "epoch": 1.39, "learning_rate": 2.276403991285514e-06, "logits/chosen": -1.9517619609832764, "logits/rejected": -1.9761276245117188, "logps/chosen": -66.858642578125, "logps/rejected": -88.45024108886719, "loss": 0.349, "rewards/accuracies": 1.0, "rewards/chosen": 7.657199382781982, "rewards/margins": 0.9853806495666504, "rewards/rejected": 6.671818733215332, "step": 6261 }, { "epoch": 1.39, "learning_rate": 2.274901086819031e-06, "logits/chosen": -1.9025498628616333, "logits/rejected": -1.8778713941574097, "logps/chosen": -77.9287109375, "logps/rejected": -39.173648834228516, "loss": 0.1498, "rewards/accuracies": 1.0, "rewards/chosen": 3.5403008460998535, "rewards/margins": 1.292734146118164, "rewards/rejected": 2.2475666999816895, "step": 6262 }, { "epoch": 1.39, "learning_rate": 2.2733985325212034e-06, "logits/chosen": -2.0037808418273926, "logits/rejected": -2.018540620803833, "logps/chosen": -39.94709014892578, "logps/rejected": -26.887657165527344, "loss": 0.5917, "rewards/accuracies": 0.0, "rewards/chosen": 2.4440619945526123, "rewards/margins": -0.6996705532073975, "rewards/rejected": 3.1437325477600098, "step": 6263 }, { "epoch": 1.39, "learning_rate": 2.271896328585114e-06, "logits/chosen": -2.216308832168579, "logits/rejected": -1.7500622272491455, "logps/chosen": -50.86757278442383, "logps/rejected": -25.773849487304688, "loss": 1.0937, "rewards/accuracies": 1.0, "rewards/chosen": 5.02882194519043, "rewards/margins": 3.040337562561035, "rewards/rejected": 1.988484263420105, "step": 6264 }, { "epoch": 1.39, "learning_rate": 2.2703944752037854e-06, "logits/chosen": -2.0672197341918945, "logits/rejected": -2.1010818481445312, "logps/chosen": -39.85706329345703, "logps/rejected": -63.08671951293945, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/chosen": 4.470710277557373, "rewards/margins": 0.600916862487793, "rewards/rejected": 3.86979341506958, "step": 6265 }, { "epoch": 1.39, "learning_rate": 2.268892972570205e-06, "logits/chosen": -1.783511996269226, "logits/rejected": -1.626384973526001, "logps/chosen": -93.50396728515625, "logps/rejected": -50.63595962524414, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": 6.391716003417969, "rewards/margins": 3.4977519512176514, "rewards/rejected": 2.8939640522003174, "step": 6266 }, { "epoch": 1.39, "learning_rate": 2.2673918208773127e-06, "logits/chosen": -2.164165735244751, "logits/rejected": -2.164165735244751, "logps/chosen": -41.34749984741211, "logps/rejected": -41.34749984741211, "loss": 0.8097, "rewards/accuracies": 0.0, "rewards/chosen": 4.606581687927246, "rewards/margins": 0.0, "rewards/rejected": 4.606581687927246, "step": 6267 }, { "epoch": 1.39, "learning_rate": 2.2658910203180022e-06, "logits/chosen": -1.5541855096817017, "logits/rejected": -1.4905802011489868, "logps/chosen": -17.761287689208984, "logps/rejected": -9.284542083740234, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 2.840369939804077, "rewards/margins": 2.0519320964813232, "rewards/rejected": 0.7884378433227539, "step": 6268 }, { "epoch": 1.39, "learning_rate": 2.2643905710851248e-06, "logits/chosen": -1.8053431510925293, "logits/rejected": -1.8053431510925293, "logps/chosen": -29.290958404541016, "logps/rejected": -29.290958404541016, "loss": 0.4952, "rewards/accuracies": 0.0, "rewards/chosen": 5.0528388023376465, "rewards/margins": 0.0, "rewards/rejected": 5.0528388023376465, "step": 6269 }, { "epoch": 1.39, "learning_rate": 2.2628904733714795e-06, "logits/chosen": -1.9218724966049194, "logits/rejected": -1.9218724966049194, "logps/chosen": -43.66682434082031, "logps/rejected": -43.66682434082031, "loss": 0.3506, "rewards/accuracies": 0.0, "rewards/chosen": 4.777072429656982, "rewards/margins": 0.0, "rewards/rejected": 4.777072429656982, "step": 6270 }, { "epoch": 1.39, "learning_rate": 2.261390727369832e-06, "logits/chosen": -1.956945776939392, "logits/rejected": -1.9495165348052979, "logps/chosen": -72.68994903564453, "logps/rejected": -60.28141784667969, "loss": 0.1263, "rewards/accuracies": 1.0, "rewards/chosen": 4.980144023895264, "rewards/margins": 1.3409945964813232, "rewards/rejected": 3.6391494274139404, "step": 6271 }, { "epoch": 1.39, "learning_rate": 2.259891333272889e-06, "logits/chosen": -1.9713835716247559, "logits/rejected": -1.9713835716247559, "logps/chosen": -15.979604721069336, "logps/rejected": -15.979604721069336, "loss": 0.965, "rewards/accuracies": 0.0, "rewards/chosen": 1.4640913009643555, "rewards/margins": 0.0, "rewards/rejected": 1.4640913009643555, "step": 6272 }, { "epoch": 1.39, "learning_rate": 2.258392291273326e-06, "logits/chosen": -2.045468807220459, "logits/rejected": -1.9848212003707886, "logps/chosen": -75.08683776855469, "logps/rejected": -68.31694030761719, "loss": 0.215, "rewards/accuracies": 1.0, "rewards/chosen": 6.410337924957275, "rewards/margins": 1.3026552200317383, "rewards/rejected": 5.107682704925537, "step": 6273 }, { "epoch": 1.39, "learning_rate": 2.256893601563761e-06, "logits/chosen": -2.0663645267486572, "logits/rejected": -1.6498626470565796, "logps/chosen": -81.41278076171875, "logps/rejected": -52.43211364746094, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": 7.016725063323975, "rewards/margins": 3.254485845565796, "rewards/rejected": 3.7622392177581787, "step": 6274 }, { "epoch": 1.39, "learning_rate": 2.255395264336774e-06, "logits/chosen": -1.9066715240478516, "logits/rejected": -1.877771258354187, "logps/chosen": -44.89125061035156, "logps/rejected": -43.81370544433594, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 2.9794723987579346, "rewards/margins": 0.6139588356018066, "rewards/rejected": 2.365513563156128, "step": 6275 }, { "epoch": 1.39, "learning_rate": 2.2538972797849002e-06, "logits/chosen": -2.1738264560699463, "logits/rejected": -2.159252643585205, "logps/chosen": -64.78524780273438, "logps/rejected": -48.46355438232422, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": 4.664224147796631, "rewards/margins": 0.6198177337646484, "rewards/rejected": 4.044406414031982, "step": 6276 }, { "epoch": 1.39, "learning_rate": 2.2523996481006205e-06, "logits/chosen": -1.6563971042633057, "logits/rejected": -1.5682399272918701, "logps/chosen": -49.425899505615234, "logps/rejected": -55.70572280883789, "loss": 0.2383, "rewards/accuracies": 1.0, "rewards/chosen": 4.062837600708008, "rewards/margins": 1.769895076751709, "rewards/rejected": 2.292942523956299, "step": 6277 }, { "epoch": 1.39, "learning_rate": 2.2509023694763844e-06, "logits/chosen": -1.9293204545974731, "logits/rejected": -1.2107489109039307, "logps/chosen": -86.82232666015625, "logps/rejected": -83.44625091552734, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 7.674211025238037, "rewards/margins": 2.563683032989502, "rewards/rejected": 5.110527992248535, "step": 6278 }, { "epoch": 1.39, "learning_rate": 2.2494054441045813e-06, "logits/chosen": -2.019418716430664, "logits/rejected": -1.9119901657104492, "logps/chosen": -59.04552459716797, "logps/rejected": -30.345733642578125, "loss": 0.343, "rewards/accuracies": 1.0, "rewards/chosen": 3.5784194469451904, "rewards/margins": 0.6291513442993164, "rewards/rejected": 2.949268102645874, "step": 6279 }, { "epoch": 1.39, "learning_rate": 2.247908872177571e-06, "logits/chosen": -1.7307449579238892, "logits/rejected": -1.7119951248168945, "logps/chosen": -39.21318817138672, "logps/rejected": -44.027442932128906, "loss": 0.305, "rewards/accuracies": 1.0, "rewards/chosen": 2.6591286659240723, "rewards/margins": 0.17460942268371582, "rewards/rejected": 2.4845192432403564, "step": 6280 }, { "epoch": 1.39, "learning_rate": 2.24641265388765e-06, "logits/chosen": -2.1302995681762695, "logits/rejected": -2.141409397125244, "logps/chosen": -29.289514541625977, "logps/rejected": -74.8089370727539, "loss": 1.2428, "rewards/accuracies": 0.0, "rewards/chosen": 5.7475457191467285, "rewards/margins": -2.394318103790283, "rewards/rejected": 8.141863822937012, "step": 6281 }, { "epoch": 1.39, "learning_rate": 2.244916789427088e-06, "logits/chosen": -1.9621555805206299, "logits/rejected": -1.8821561336517334, "logps/chosen": -75.1959457397461, "logps/rejected": -40.30730438232422, "loss": 0.1131, "rewards/accuracies": 1.0, "rewards/chosen": 5.546680450439453, "rewards/margins": 1.4142975807189941, "rewards/rejected": 4.132382869720459, "step": 6282 }, { "epoch": 1.39, "learning_rate": 2.2434212789880935e-06, "logits/chosen": -1.8458870649337769, "logits/rejected": -1.8444305658340454, "logps/chosen": -69.5766372680664, "logps/rejected": -47.333404541015625, "loss": 0.1595, "rewards/accuracies": 1.0, "rewards/chosen": 4.400464057922363, "rewards/margins": 1.0224213600158691, "rewards/rejected": 3.378042697906494, "step": 6283 }, { "epoch": 1.39, "learning_rate": 2.2419261227628376e-06, "logits/chosen": -1.9661542177200317, "logits/rejected": -1.9351853132247925, "logps/chosen": -105.94770812988281, "logps/rejected": -82.75047302246094, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 7.350505352020264, "rewards/margins": 5.136566162109375, "rewards/rejected": 2.2139389514923096, "step": 6284 }, { "epoch": 1.39, "learning_rate": 2.2404313209434468e-06, "logits/chosen": -1.7951337099075317, "logits/rejected": -1.7287839651107788, "logps/chosen": -45.16388702392578, "logps/rejected": -21.699167251586914, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 5.9375529289245605, "rewards/margins": 3.1489930152893066, "rewards/rejected": 2.788559913635254, "step": 6285 }, { "epoch": 1.39, "learning_rate": 2.238936873721994e-06, "logits/chosen": -2.0373787879943848, "logits/rejected": -2.001948833465576, "logps/chosen": -83.42327880859375, "logps/rejected": -58.813232421875, "loss": 0.1408, "rewards/accuracies": 1.0, "rewards/chosen": 8.239028930664062, "rewards/margins": 2.7164220809936523, "rewards/rejected": 5.52260684967041, "step": 6286 }, { "epoch": 1.39, "learning_rate": 2.237442781290519e-06, "logits/chosen": -1.9608783721923828, "logits/rejected": -1.9608783721923828, "logps/chosen": -39.31308364868164, "logps/rejected": -39.31308364868164, "loss": 0.3956, "rewards/accuracies": 0.0, "rewards/chosen": 5.037968158721924, "rewards/margins": 0.0, "rewards/rejected": 5.037968158721924, "step": 6287 }, { "epoch": 1.39, "learning_rate": 2.235949043841002e-06, "logits/chosen": -2.269327163696289, "logits/rejected": -2.251701593399048, "logps/chosen": -47.494483947753906, "logps/rejected": -92.89315795898438, "loss": 0.5263, "rewards/accuracies": 0.0, "rewards/chosen": 5.074155330657959, "rewards/margins": -0.28406763076782227, "rewards/rejected": 5.358222961425781, "step": 6288 }, { "epoch": 1.39, "learning_rate": 2.234455661565392e-06, "logits/chosen": -2.091369867324829, "logits/rejected": -1.9735612869262695, "logps/chosen": -141.89471435546875, "logps/rejected": -80.22174072265625, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 7.427188396453857, "rewards/margins": 3.3316287994384766, "rewards/rejected": 4.095559597015381, "step": 6289 }, { "epoch": 1.39, "learning_rate": 2.2329626346555788e-06, "logits/chosen": -1.8579092025756836, "logits/rejected": -1.8393263816833496, "logps/chosen": -52.225059509277344, "logps/rejected": -56.8216438293457, "loss": 0.1102, "rewards/accuracies": 1.0, "rewards/chosen": 3.752883195877075, "rewards/margins": 1.4560236930847168, "rewards/rejected": 2.2968595027923584, "step": 6290 }, { "epoch": 1.39, "learning_rate": 2.231469963303416e-06, "logits/chosen": -1.8488740921020508, "logits/rejected": -1.8844552040100098, "logps/chosen": -58.46107482910156, "logps/rejected": -101.45992279052734, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": 4.122807502746582, "rewards/margins": 0.5094292163848877, "rewards/rejected": 3.6133782863616943, "step": 6291 }, { "epoch": 1.39, "learning_rate": 2.2299776477007073e-06, "logits/chosen": -2.189162015914917, "logits/rejected": -2.1006929874420166, "logps/chosen": -27.526809692382812, "logps/rejected": -9.274689674377441, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": 4.859286785125732, "rewards/margins": 3.0371270179748535, "rewards/rejected": 1.8221596479415894, "step": 6292 }, { "epoch": 1.39, "learning_rate": 2.228485688039212e-06, "logits/chosen": -2.1296961307525635, "logits/rejected": -2.088245153427124, "logps/chosen": -128.4221649169922, "logps/rejected": -146.98199462890625, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 9.267463684082031, "rewards/margins": 3.2368969917297363, "rewards/rejected": 6.030566692352295, "step": 6293 }, { "epoch": 1.39, "learning_rate": 2.226994084510643e-06, "logits/chosen": -2.1832892894744873, "logits/rejected": -2.0928852558135986, "logps/chosen": -113.52783966064453, "logps/rejected": -34.91040802001953, "loss": 0.7579, "rewards/accuracies": 1.0, "rewards/chosen": 7.141631603240967, "rewards/margins": 3.9369800090789795, "rewards/rejected": 3.2046515941619873, "step": 6294 }, { "epoch": 1.39, "learning_rate": 2.2255028373066684e-06, "logits/chosen": -1.9202640056610107, "logits/rejected": -1.9389063119888306, "logps/chosen": -66.65223693847656, "logps/rejected": -84.5431900024414, "loss": 0.5256, "rewards/accuracies": 0.0, "rewards/chosen": 8.326654434204102, "rewards/margins": -0.4143562316894531, "rewards/rejected": 8.741010665893555, "step": 6295 }, { "epoch": 1.39, "learning_rate": 2.2240119466189114e-06, "logits/chosen": -1.8380188941955566, "logits/rejected": -1.8594996929168701, "logps/chosen": -107.3027572631836, "logps/rejected": -107.50497436523438, "loss": 0.1706, "rewards/accuracies": 1.0, "rewards/chosen": 6.129715919494629, "rewards/margins": 1.0560965538024902, "rewards/rejected": 5.073619365692139, "step": 6296 }, { "epoch": 1.39, "learning_rate": 2.222521412638942e-06, "logits/chosen": -1.949816346168518, "logits/rejected": -1.865329384803772, "logps/chosen": -76.7448959350586, "logps/rejected": -46.819488525390625, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 8.644933700561523, "rewards/margins": 4.3781352043151855, "rewards/rejected": 4.266798496246338, "step": 6297 }, { "epoch": 1.39, "learning_rate": 2.221031235558298e-06, "logits/chosen": -1.9072338342666626, "logits/rejected": -1.9021923542022705, "logps/chosen": -77.98754119873047, "logps/rejected": -113.6105728149414, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 11.717728614807129, "rewards/margins": 2.112551689147949, "rewards/rejected": 9.60517692565918, "step": 6298 }, { "epoch": 1.39, "learning_rate": 2.219541415568458e-06, "logits/chosen": -2.169443368911743, "logits/rejected": -2.127408027648926, "logps/chosen": -112.80701446533203, "logps/rejected": -118.00950622558594, "loss": 0.343, "rewards/accuracies": 1.0, "rewards/chosen": 11.019030570983887, "rewards/margins": 0.09830474853515625, "rewards/rejected": 10.92072582244873, "step": 6299 }, { "epoch": 1.39, "learning_rate": 2.218051952860862e-06, "logits/chosen": -2.0533909797668457, "logits/rejected": -2.046110153198242, "logps/chosen": -55.68429183959961, "logps/rejected": -48.60176086425781, "loss": 1.0347, "rewards/accuracies": 0.0, "rewards/chosen": 1.0598820447921753, "rewards/margins": -1.9241451025009155, "rewards/rejected": 2.984027147293091, "step": 6300 }, { "epoch": 1.39, "learning_rate": 2.2165628476269023e-06, "logits/chosen": -1.8152796030044556, "logits/rejected": -1.8683737516403198, "logps/chosen": -62.063621520996094, "logps/rejected": -130.53768920898438, "loss": 0.1857, "rewards/accuracies": 1.0, "rewards/chosen": 7.928739070892334, "rewards/margins": 0.9619865417480469, "rewards/rejected": 6.966752529144287, "step": 6301 }, { "epoch": 1.39, "learning_rate": 2.215074100057926e-06, "logits/chosen": -1.7525572776794434, "logits/rejected": -1.54521644115448, "logps/chosen": -66.30552673339844, "logps/rejected": -31.243803024291992, "loss": 0.9822, "rewards/accuracies": 1.0, "rewards/chosen": 3.2053635120391846, "rewards/margins": 2.2838919162750244, "rewards/rejected": 0.9214715957641602, "step": 6302 }, { "epoch": 1.4, "learning_rate": 2.213585710345232e-06, "logits/chosen": -1.740884780883789, "logits/rejected": -1.6062037944793701, "logps/chosen": -52.82210159301758, "logps/rejected": -12.747600555419922, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 3.83008074760437, "rewards/margins": 3.064572334289551, "rewards/rejected": 0.7655084729194641, "step": 6303 }, { "epoch": 1.4, "learning_rate": 2.212097678680077e-06, "logits/chosen": -2.2909040451049805, "logits/rejected": -2.307011842727661, "logps/chosen": -75.90274047851562, "logps/rejected": -16.144479751586914, "loss": 0.349, "rewards/accuracies": 1.0, "rewards/chosen": 3.9844863414764404, "rewards/margins": 0.6229324340820312, "rewards/rejected": 3.361553907394409, "step": 6304 }, { "epoch": 1.4, "learning_rate": 2.2106100052536693e-06, "logits/chosen": -1.786059021949768, "logits/rejected": -1.8508645296096802, "logps/chosen": -66.17268371582031, "logps/rejected": -72.10643768310547, "loss": 2.1994, "rewards/accuracies": 0.0, "rewards/chosen": 5.174902439117432, "rewards/margins": -4.349503040313721, "rewards/rejected": 9.524405479431152, "step": 6305 }, { "epoch": 1.4, "learning_rate": 2.2091226902571673e-06, "logits/chosen": -1.7440905570983887, "logits/rejected": -1.8052750825881958, "logps/chosen": -41.2724723815918, "logps/rejected": -107.18605041503906, "loss": 1.7905, "rewards/accuracies": 0.0, "rewards/chosen": 4.558953762054443, "rewards/margins": -2.8127808570861816, "rewards/rejected": 7.371734619140625, "step": 6306 }, { "epoch": 1.4, "learning_rate": 2.207635733881694e-06, "logits/chosen": -1.9303404092788696, "logits/rejected": -1.9178098440170288, "logps/chosen": -122.13229370117188, "logps/rejected": -105.30084228515625, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 10.6922025680542, "rewards/margins": 3.4825892448425293, "rewards/rejected": 7.20961332321167, "step": 6307 }, { "epoch": 1.4, "learning_rate": 2.206149136318314e-06, "logits/chosen": -1.6488912105560303, "logits/rejected": -1.6321486234664917, "logps/chosen": -60.79273223876953, "logps/rejected": -60.30951690673828, "loss": 0.1201, "rewards/accuracies": 1.0, "rewards/chosen": 4.120725154876709, "rewards/margins": 1.7906935214996338, "rewards/rejected": 2.330031633377075, "step": 6308 }, { "epoch": 1.4, "learning_rate": 2.204662897758054e-06, "logits/chosen": -1.7188152074813843, "logits/rejected": -1.7033727169036865, "logps/chosen": -25.485462188720703, "logps/rejected": -50.22751235961914, "loss": 0.3557, "rewards/accuracies": 1.0, "rewards/chosen": 3.6110405921936035, "rewards/margins": 0.09944272041320801, "rewards/rejected": 3.5115978717803955, "step": 6309 }, { "epoch": 1.4, "learning_rate": 2.2031770183918927e-06, "logits/chosen": -2.2181859016418457, "logits/rejected": -2.1963727474212646, "logps/chosen": -63.6710090637207, "logps/rejected": -65.8840560913086, "loss": 0.2974, "rewards/accuracies": 1.0, "rewards/chosen": 4.9787397384643555, "rewards/margins": 0.6681437492370605, "rewards/rejected": 4.310595989227295, "step": 6310 }, { "epoch": 1.4, "learning_rate": 2.20169149841076e-06, "logits/chosen": -1.7830893993377686, "logits/rejected": -1.7811925411224365, "logps/chosen": -47.72794723510742, "logps/rejected": -51.238128662109375, "loss": 0.2607, "rewards/accuracies": 1.0, "rewards/chosen": 4.212677478790283, "rewards/margins": 0.3860204219818115, "rewards/rejected": 3.8266570568084717, "step": 6311 }, { "epoch": 1.4, "learning_rate": 2.2002063380055434e-06, "logits/chosen": -1.7430275678634644, "logits/rejected": -1.7213082313537598, "logps/chosen": -11.892326354980469, "logps/rejected": -3.094510793685913, "loss": 0.1061, "rewards/accuracies": 1.0, "rewards/chosen": 1.7986217737197876, "rewards/margins": 1.4486504793167114, "rewards/rejected": 0.34997132420539856, "step": 6312 }, { "epoch": 1.4, "learning_rate": 2.1987215373670815e-06, "logits/chosen": -1.9749442338943481, "logits/rejected": -1.9679449796676636, "logps/chosen": -84.67185974121094, "logps/rejected": -99.40867614746094, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 8.015504837036133, "rewards/margins": 1.7939000129699707, "rewards/rejected": 6.221604824066162, "step": 6313 }, { "epoch": 1.4, "learning_rate": 2.1972370966861685e-06, "logits/chosen": -1.9665987491607666, "logits/rejected": -1.997375249862671, "logps/chosen": -82.45791625976562, "logps/rejected": -96.26020812988281, "loss": 0.1482, "rewards/accuracies": 1.0, "rewards/chosen": 9.99675178527832, "rewards/margins": 1.109750747680664, "rewards/rejected": 8.887001037597656, "step": 6314 }, { "epoch": 1.4, "learning_rate": 2.1957530161535506e-06, "logits/chosen": -1.786139965057373, "logits/rejected": -1.8138186931610107, "logps/chosen": -50.97722625732422, "logps/rejected": -50.65422439575195, "loss": 0.6213, "rewards/accuracies": 0.0, "rewards/chosen": 2.959376573562622, "rewards/margins": -0.682816743850708, "rewards/rejected": 3.64219331741333, "step": 6315 }, { "epoch": 1.4, "learning_rate": 2.194269295959931e-06, "logits/chosen": -1.6907684803009033, "logits/rejected": -1.6907684803009033, "logps/chosen": -19.294612884521484, "logps/rejected": -19.294612884521484, "loss": 0.3513, "rewards/accuracies": 0.0, "rewards/chosen": 2.6955761909484863, "rewards/margins": 0.0, "rewards/rejected": 2.6955761909484863, "step": 6316 }, { "epoch": 1.4, "learning_rate": 2.19278593629596e-06, "logits/chosen": -1.7550708055496216, "logits/rejected": -1.6815383434295654, "logps/chosen": -82.84739685058594, "logps/rejected": -69.22544860839844, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 5.49313497543335, "rewards/margins": 4.030309677124023, "rewards/rejected": 1.462825059890747, "step": 6317 }, { "epoch": 1.4, "learning_rate": 2.1913029373522476e-06, "logits/chosen": -1.8195792436599731, "logits/rejected": -1.8195792436599731, "logps/chosen": -42.92089080810547, "logps/rejected": -42.92089080810547, "loss": 0.433, "rewards/accuracies": 0.0, "rewards/chosen": 8.124202728271484, "rewards/margins": 0.0, "rewards/rejected": 8.124202728271484, "step": 6318 }, { "epoch": 1.4, "learning_rate": 2.1898202993193562e-06, "logits/chosen": -1.7962895631790161, "logits/rejected": -1.7057181596755981, "logps/chosen": -48.346920013427734, "logps/rejected": -41.170475006103516, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 3.0732128620147705, "rewards/margins": 2.6157898902893066, "rewards/rejected": 0.45742303133010864, "step": 6319 }, { "epoch": 1.4, "learning_rate": 2.1883380223878004e-06, "logits/chosen": -1.8679646253585815, "logits/rejected": -1.3531601428985596, "logps/chosen": -27.19049644470215, "logps/rejected": -40.432090759277344, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 3.4185829162597656, "rewards/margins": -1.1037096977233887, "rewards/rejected": 4.522292613983154, "step": 6320 }, { "epoch": 1.4, "learning_rate": 2.1868561067480504e-06, "logits/chosen": -1.7428979873657227, "logits/rejected": -1.6482574939727783, "logps/chosen": -118.17659759521484, "logps/rejected": -52.83045196533203, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 9.739994049072266, "rewards/margins": 4.701180934906006, "rewards/rejected": 5.03881311416626, "step": 6321 }, { "epoch": 1.4, "learning_rate": 2.1853745525905275e-06, "logits/chosen": -1.9513558149337769, "logits/rejected": -1.9513558149337769, "logps/chosen": -38.237525939941406, "logps/rejected": -38.237525939941406, "loss": 0.3585, "rewards/accuracies": 0.0, "rewards/chosen": 3.852313995361328, "rewards/margins": 0.0, "rewards/rejected": 3.852313995361328, "step": 6322 }, { "epoch": 1.4, "learning_rate": 2.1838933601056085e-06, "logits/chosen": -1.5304179191589355, "logits/rejected": -1.4987937211990356, "logps/chosen": -52.78072738647461, "logps/rejected": -37.49456787109375, "loss": 0.408, "rewards/accuracies": 1.0, "rewards/chosen": 3.332972288131714, "rewards/margins": 0.14387226104736328, "rewards/rejected": 3.1891000270843506, "step": 6323 }, { "epoch": 1.4, "learning_rate": 2.182412529483623e-06, "logits/chosen": -1.870508074760437, "logits/rejected": -1.7969067096710205, "logps/chosen": -41.10474395751953, "logps/rejected": -24.172883987426758, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 3.881702423095703, "rewards/margins": 3.1653025150299072, "rewards/rejected": 0.7163999676704407, "step": 6324 }, { "epoch": 1.4, "learning_rate": 2.1809320609148544e-06, "logits/chosen": -2.15966534614563, "logits/rejected": -2.1578028202056885, "logps/chosen": -38.96381378173828, "logps/rejected": -59.77648162841797, "loss": 0.6363, "rewards/accuracies": 1.0, "rewards/chosen": 4.738320350646973, "rewards/margins": 0.2419877052307129, "rewards/rejected": 4.49633264541626, "step": 6325 }, { "epoch": 1.4, "learning_rate": 2.179451954589541e-06, "logits/chosen": -2.242741823196411, "logits/rejected": -2.2428088188171387, "logps/chosen": -58.58610153198242, "logps/rejected": -82.68392181396484, "loss": 0.3077, "rewards/accuracies": 1.0, "rewards/chosen": 4.145262718200684, "rewards/margins": 0.40551114082336426, "rewards/rejected": 3.7397515773773193, "step": 6326 }, { "epoch": 1.4, "learning_rate": 2.1779722106978683e-06, "logits/chosen": -1.5366230010986328, "logits/rejected": -1.5366230010986328, "logps/chosen": -33.87831497192383, "logps/rejected": -33.87831497192383, "loss": 0.5932, "rewards/accuracies": 0.0, "rewards/chosen": 1.0179637670516968, "rewards/margins": 0.0, "rewards/rejected": 1.0179637670516968, "step": 6327 }, { "epoch": 1.4, "learning_rate": 2.1764928294299835e-06, "logits/chosen": -1.46510910987854, "logits/rejected": -1.4941338300704956, "logps/chosen": -29.331096649169922, "logps/rejected": -50.24727249145508, "loss": 0.4877, "rewards/accuracies": 1.0, "rewards/chosen": 3.7172138690948486, "rewards/margins": 0.7494025230407715, "rewards/rejected": 2.967811346054077, "step": 6328 }, { "epoch": 1.4, "learning_rate": 2.175013810975982e-06, "logits/chosen": -2.02514386177063, "logits/rejected": -2.029005527496338, "logps/chosen": -48.31262969970703, "logps/rejected": -50.30073928833008, "loss": 0.1699, "rewards/accuracies": 1.0, "rewards/chosen": 5.4351677894592285, "rewards/margins": 1.6504437923431396, "rewards/rejected": 3.784723997116089, "step": 6329 }, { "epoch": 1.4, "learning_rate": 2.1735351555259153e-06, "logits/chosen": -1.7720268964767456, "logits/rejected": -1.704885721206665, "logps/chosen": -61.8072395324707, "logps/rejected": -139.32135009765625, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 7.242895603179932, "rewards/margins": 3.167452812194824, "rewards/rejected": 4.075442790985107, "step": 6330 }, { "epoch": 1.4, "learning_rate": 2.172056863269786e-06, "logits/chosen": -1.9641344547271729, "logits/rejected": -1.6400775909423828, "logps/chosen": -112.3358154296875, "logps/rejected": -57.304840087890625, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 6.994102478027344, "rewards/margins": 2.9204792976379395, "rewards/rejected": 4.073623180389404, "step": 6331 }, { "epoch": 1.4, "learning_rate": 2.170578934397552e-06, "logits/chosen": -1.9966667890548706, "logits/rejected": -1.958165168762207, "logps/chosen": -80.49229431152344, "logps/rejected": -118.37488555908203, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 6.070927619934082, "rewards/margins": 2.2107675075531006, "rewards/rejected": 3.8601601123809814, "step": 6332 }, { "epoch": 1.4, "learning_rate": 2.169101369099123e-06, "logits/chosen": -2.1300208568573, "logits/rejected": -2.1217715740203857, "logps/chosen": -35.29383850097656, "logps/rejected": -54.77991485595703, "loss": 0.1827, "rewards/accuracies": 1.0, "rewards/chosen": 4.235938549041748, "rewards/margins": 1.0265817642211914, "rewards/rejected": 3.2093567848205566, "step": 6333 }, { "epoch": 1.4, "learning_rate": 2.1676241675643627e-06, "logits/chosen": -2.0136070251464844, "logits/rejected": -1.8952351808547974, "logps/chosen": -61.88034439086914, "logps/rejected": -15.496801376342773, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 4.1463942527771, "rewards/margins": 3.067939043045044, "rewards/rejected": 1.0784552097320557, "step": 6334 }, { "epoch": 1.4, "learning_rate": 2.1661473299830905e-06, "logits/chosen": -2.108769178390503, "logits/rejected": -2.109999656677246, "logps/chosen": -41.241790771484375, "logps/rejected": -66.34825897216797, "loss": 0.1306, "rewards/accuracies": 1.0, "rewards/chosen": 3.398815155029297, "rewards/margins": 1.2182388305664062, "rewards/rejected": 2.1805763244628906, "step": 6335 }, { "epoch": 1.4, "learning_rate": 2.164670856545071e-06, "logits/chosen": -1.9624942541122437, "logits/rejected": -1.9219142198562622, "logps/chosen": -69.54902648925781, "logps/rejected": -115.79736328125, "loss": 2.1999, "rewards/accuracies": 1.0, "rewards/chosen": 7.312705993652344, "rewards/margins": 4.116380214691162, "rewards/rejected": 3.1963257789611816, "step": 6336 }, { "epoch": 1.4, "learning_rate": 2.1631947474400315e-06, "logits/chosen": -2.0004336833953857, "logits/rejected": -2.0168943405151367, "logps/chosen": -23.515933990478516, "logps/rejected": -46.13914489746094, "loss": 1.0526, "rewards/accuracies": 0.0, "rewards/chosen": 2.1959340572357178, "rewards/margins": -1.8256800174713135, "rewards/rejected": 4.021614074707031, "step": 6337 }, { "epoch": 1.4, "learning_rate": 2.161719002857647e-06, "logits/chosen": -1.9765393733978271, "logits/rejected": -1.851563572883606, "logps/chosen": -56.54115676879883, "logps/rejected": -38.95096206665039, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 2.439727544784546, "rewards/margins": 1.880600094795227, "rewards/rejected": 0.5591274499893188, "step": 6338 }, { "epoch": 1.4, "learning_rate": 2.160243622987548e-06, "logits/chosen": -1.9705891609191895, "logits/rejected": -1.9466546773910522, "logps/chosen": -62.110626220703125, "logps/rejected": -43.78668212890625, "loss": 0.1065, "rewards/accuracies": 1.0, "rewards/chosen": 4.211278438568115, "rewards/margins": 1.7068166732788086, "rewards/rejected": 2.5044617652893066, "step": 6339 }, { "epoch": 1.4, "learning_rate": 2.1587686080193158e-06, "logits/chosen": -1.8526138067245483, "logits/rejected": -1.866929292678833, "logps/chosen": -30.029088973999023, "logps/rejected": -45.913455963134766, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": 4.025400638580322, "rewards/margins": 1.4006428718566895, "rewards/rejected": 2.624757766723633, "step": 6340 }, { "epoch": 1.4, "learning_rate": 2.1572939581424883e-06, "logits/chosen": -1.940158724784851, "logits/rejected": -1.8450934886932373, "logps/chosen": -48.61949920654297, "logps/rejected": -14.688257217407227, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 5.849219799041748, "rewards/margins": 3.265212297439575, "rewards/rejected": 2.584007501602173, "step": 6341 }, { "epoch": 1.4, "learning_rate": 2.1558196735465555e-06, "logits/chosen": -1.8026570081710815, "logits/rejected": -1.7805098295211792, "logps/chosen": -77.62037658691406, "logps/rejected": -76.57774353027344, "loss": 0.1813, "rewards/accuracies": 1.0, "rewards/chosen": 4.528240203857422, "rewards/margins": 0.9966132640838623, "rewards/rejected": 3.5316269397735596, "step": 6342 }, { "epoch": 1.4, "learning_rate": 2.154345754420953e-06, "logits/chosen": -1.8045563697814941, "logits/rejected": -1.5118366479873657, "logps/chosen": -151.78353881835938, "logps/rejected": -36.64617919921875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 6.890435695648193, "rewards/margins": 3.7142393589019775, "rewards/rejected": 3.176196336746216, "step": 6343 }, { "epoch": 1.4, "learning_rate": 2.1528722009550844e-06, "logits/chosen": -2.2003400325775146, "logits/rejected": -2.134930372238159, "logps/chosen": -100.06021881103516, "logps/rejected": -72.19489288330078, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 7.414687633514404, "rewards/margins": 2.7472658157348633, "rewards/rejected": 4.667421817779541, "step": 6344 }, { "epoch": 1.4, "learning_rate": 2.15139901333829e-06, "logits/chosen": -2.303784132003784, "logits/rejected": -2.3129286766052246, "logps/chosen": -34.33818435668945, "logps/rejected": -74.22760009765625, "loss": 0.9764, "rewards/accuracies": 1.0, "rewards/chosen": 3.9273815155029297, "rewards/margins": 0.0010638236999511719, "rewards/rejected": 3.9263176918029785, "step": 6345 }, { "epoch": 1.4, "learning_rate": 2.149926191759878e-06, "logits/chosen": -1.8711248636245728, "logits/rejected": -1.7400037050247192, "logps/chosen": -167.91497802734375, "logps/rejected": -21.339000701904297, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 5.679408550262451, "rewards/margins": 5.038963317871094, "rewards/rejected": 0.640445351600647, "step": 6346 }, { "epoch": 1.4, "learning_rate": 2.1484537364090976e-06, "logits/chosen": -2.058816432952881, "logits/rejected": -2.080756664276123, "logps/chosen": -37.378623962402344, "logps/rejected": -62.498199462890625, "loss": 1.2313, "rewards/accuracies": 0.0, "rewards/chosen": 4.8501715660095215, "rewards/margins": -2.2667016983032227, "rewards/rejected": 7.116873264312744, "step": 6347 }, { "epoch": 1.41, "learning_rate": 2.1469816474751566e-06, "logits/chosen": -1.731679916381836, "logits/rejected": -1.7995027303695679, "logps/chosen": -17.0426025390625, "logps/rejected": -55.79713439941406, "loss": 1.2592, "rewards/accuracies": 0.0, "rewards/chosen": 1.0227108001708984, "rewards/margins": -2.2964444160461426, "rewards/rejected": 3.319155216217041, "step": 6348 }, { "epoch": 1.41, "learning_rate": 2.1455099251472155e-06, "logits/chosen": -1.8623132705688477, "logits/rejected": -1.6204791069030762, "logps/chosen": -66.7411117553711, "logps/rejected": -38.2943000793457, "loss": 0.0702, "rewards/accuracies": 1.0, "rewards/chosen": 7.705258846282959, "rewards/margins": 3.245708465576172, "rewards/rejected": 4.459550380706787, "step": 6349 }, { "epoch": 1.41, "learning_rate": 2.1440385696143875e-06, "logits/chosen": -1.8996763229370117, "logits/rejected": -1.8748271465301514, "logps/chosen": -55.904457092285156, "logps/rejected": -39.161094665527344, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 4.8706536293029785, "rewards/margins": 1.3644256591796875, "rewards/rejected": 3.506227970123291, "step": 6350 }, { "epoch": 1.41, "learning_rate": 2.1425675810657387e-06, "logits/chosen": -2.1110665798187256, "logits/rejected": -2.084526777267456, "logps/chosen": -79.77389526367188, "logps/rejected": -175.35162353515625, "loss": 0.153, "rewards/accuracies": 1.0, "rewards/chosen": 7.892032146453857, "rewards/margins": 1.1694154739379883, "rewards/rejected": 6.722616672515869, "step": 6351 }, { "epoch": 1.41, "learning_rate": 2.141096959690284e-06, "logits/chosen": -1.857487678527832, "logits/rejected": -1.9085173606872559, "logps/chosen": -55.28130340576172, "logps/rejected": -88.89274597167969, "loss": 0.921, "rewards/accuracies": 0.0, "rewards/chosen": 3.9846558570861816, "rewards/margins": -1.4044723510742188, "rewards/rejected": 5.3891282081604, "step": 6352 }, { "epoch": 1.41, "learning_rate": 2.1396267056770004e-06, "logits/chosen": -1.6694753170013428, "logits/rejected": -1.6351189613342285, "logps/chosen": -38.191650390625, "logps/rejected": -39.05990219116211, "loss": 0.2893, "rewards/accuracies": 1.0, "rewards/chosen": 4.8831682205200195, "rewards/margins": 0.5116634368896484, "rewards/rejected": 4.371504783630371, "step": 6353 }, { "epoch": 1.41, "learning_rate": 2.1381568192148057e-06, "logits/chosen": -1.7847687005996704, "logits/rejected": -1.7825241088867188, "logps/chosen": -62.65520095825195, "logps/rejected": -77.5557861328125, "loss": 0.4014, "rewards/accuracies": 1.0, "rewards/chosen": 5.638383388519287, "rewards/margins": 0.3202681541442871, "rewards/rejected": 5.318115234375, "step": 6354 }, { "epoch": 1.41, "learning_rate": 2.136687300492583e-06, "logits/chosen": -2.1404573917388916, "logits/rejected": -2.1269986629486084, "logps/chosen": -68.2775650024414, "logps/rejected": -85.10684204101562, "loss": 0.0612, "rewards/accuracies": 1.0, "rewards/chosen": 8.821455001831055, "rewards/margins": 2.6667675971984863, "rewards/rejected": 6.154687404632568, "step": 6355 }, { "epoch": 1.41, "learning_rate": 2.1352181496991575e-06, "logits/chosen": -1.8463388681411743, "logits/rejected": -1.8538941144943237, "logps/chosen": -33.74308776855469, "logps/rejected": -39.10503387451172, "loss": 1.0761, "rewards/accuracies": 0.0, "rewards/chosen": 3.2492005825042725, "rewards/margins": -1.5460069179534912, "rewards/rejected": 4.795207500457764, "step": 6356 }, { "epoch": 1.41, "learning_rate": 2.1337493670233133e-06, "logits/chosen": -1.8462127447128296, "logits/rejected": -1.7937653064727783, "logps/chosen": -46.37474060058594, "logps/rejected": -44.12303161621094, "loss": 0.1513, "rewards/accuracies": 1.0, "rewards/chosen": 4.247061252593994, "rewards/margins": 1.3072013854980469, "rewards/rejected": 2.9398598670959473, "step": 6357 }, { "epoch": 1.41, "learning_rate": 2.132280952653785e-06, "logits/chosen": -1.9688599109649658, "logits/rejected": -1.9500375986099243, "logps/chosen": -69.44778442382812, "logps/rejected": -87.661865234375, "loss": 0.7955, "rewards/accuracies": 0.0, "rewards/chosen": 6.5252227783203125, "rewards/margins": -1.278552532196045, "rewards/rejected": 7.803775310516357, "step": 6358 }, { "epoch": 1.41, "learning_rate": 2.1308129067792606e-06, "logits/chosen": -1.6942648887634277, "logits/rejected": -1.6011284589767456, "logps/chosen": -96.5546875, "logps/rejected": -36.51609802246094, "loss": 0.0797, "rewards/accuracies": 1.0, "rewards/chosen": 5.343197822570801, "rewards/margins": 2.4666292667388916, "rewards/rejected": 2.876568555831909, "step": 6359 }, { "epoch": 1.41, "learning_rate": 2.1293452295883823e-06, "logits/chosen": -1.8602635860443115, "logits/rejected": -1.8496208190917969, "logps/chosen": -56.451820373535156, "logps/rejected": -53.194271087646484, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 3.8741188049316406, "rewards/margins": 1.1254496574401855, "rewards/rejected": 2.748669147491455, "step": 6360 }, { "epoch": 1.41, "learning_rate": 2.127877921269737e-06, "logits/chosen": -1.9102965593338013, "logits/rejected": -1.8533682823181152, "logps/chosen": -39.60382843017578, "logps/rejected": -57.910606384277344, "loss": 0.0984, "rewards/accuracies": 1.0, "rewards/chosen": 4.12017822265625, "rewards/margins": 1.8174567222595215, "rewards/rejected": 2.3027215003967285, "step": 6361 }, { "epoch": 1.41, "learning_rate": 2.1264109820118783e-06, "logits/chosen": -2.075474739074707, "logits/rejected": -1.8952816724777222, "logps/chosen": -115.73529052734375, "logps/rejected": -24.450044631958008, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 4.802398681640625, "rewards/margins": 4.907547950744629, "rewards/rejected": -0.10514946281909943, "step": 6362 }, { "epoch": 1.41, "learning_rate": 2.124944412003297e-06, "logits/chosen": -1.8126217126846313, "logits/rejected": -1.7967714071273804, "logps/chosen": -42.66228103637695, "logps/rejected": -48.005165100097656, "loss": 1.2526, "rewards/accuracies": 0.0, "rewards/chosen": 4.485787868499756, "rewards/margins": -1.3459696769714355, "rewards/rejected": 5.831757545471191, "step": 6363 }, { "epoch": 1.41, "learning_rate": 2.123478211432451e-06, "logits/chosen": -1.7681660652160645, "logits/rejected": -1.7829294204711914, "logps/chosen": -59.15193557739258, "logps/rejected": -102.916015625, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 6.8959550857543945, "rewards/margins": 1.5886354446411133, "rewards/rejected": 5.307319641113281, "step": 6364 }, { "epoch": 1.41, "learning_rate": 2.1220123804877374e-06, "logits/chosen": -1.7546480894088745, "logits/rejected": -1.7546480894088745, "logps/chosen": -16.56330680847168, "logps/rejected": -16.56330680847168, "loss": 0.5994, "rewards/accuracies": 0.0, "rewards/chosen": 2.2217721939086914, "rewards/margins": 0.0, "rewards/rejected": 2.2217721939086914, "step": 6365 }, { "epoch": 1.41, "learning_rate": 2.120546919357515e-06, "logits/chosen": -2.011993169784546, "logits/rejected": -2.001070499420166, "logps/chosen": -41.7686882019043, "logps/rejected": -62.06597137451172, "loss": 2.0507, "rewards/accuracies": 0.0, "rewards/chosen": 4.706000328063965, "rewards/margins": -2.6879091262817383, "rewards/rejected": 7.393909454345703, "step": 6366 }, { "epoch": 1.41, "learning_rate": 2.1190818282300913e-06, "logits/chosen": -2.031850814819336, "logits/rejected": -2.000450849533081, "logps/chosen": -80.28279876708984, "logps/rejected": -113.12763977050781, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": 5.096163272857666, "rewards/margins": 2.5264594554901123, "rewards/rejected": 2.5697038173675537, "step": 6367 }, { "epoch": 1.41, "learning_rate": 2.1176171072937275e-06, "logits/chosen": -1.849963665008545, "logits/rejected": -1.8399611711502075, "logps/chosen": -78.95895385742188, "logps/rejected": -56.36865234375, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 8.747859001159668, "rewards/margins": 1.9509634971618652, "rewards/rejected": 6.796895503997803, "step": 6368 }, { "epoch": 1.41, "learning_rate": 2.1161527567366373e-06, "logits/chosen": -1.8560055494308472, "logits/rejected": -1.6716710329055786, "logps/chosen": -182.94676208496094, "logps/rejected": -53.789466857910156, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 5.1925950050354, "rewards/margins": 4.176425933837891, "rewards/rejected": 1.0161689519882202, "step": 6369 }, { "epoch": 1.41, "learning_rate": 2.1146887767469825e-06, "logits/chosen": -1.6270748376846313, "logits/rejected": -1.6270748376846313, "logps/chosen": -35.833805084228516, "logps/rejected": -35.833805084228516, "loss": 0.392, "rewards/accuracies": 0.0, "rewards/chosen": 1.592600703239441, "rewards/margins": 0.0, "rewards/rejected": 1.592600703239441, "step": 6370 }, { "epoch": 1.41, "learning_rate": 2.113225167512887e-06, "logits/chosen": -1.8081505298614502, "logits/rejected": -1.8072243928909302, "logps/chosen": -26.4956111907959, "logps/rejected": -28.254165649414062, "loss": 0.357, "rewards/accuracies": 0.0, "rewards/chosen": 2.427323818206787, "rewards/margins": -0.008663415908813477, "rewards/rejected": 2.4359872341156006, "step": 6371 }, { "epoch": 1.41, "learning_rate": 2.1117619292224145e-06, "logits/chosen": -1.9089162349700928, "logits/rejected": -1.886936902999878, "logps/chosen": -56.58382797241211, "logps/rejected": -68.26752471923828, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 5.590290546417236, "rewards/margins": 3.4025790691375732, "rewards/rejected": 2.187711477279663, "step": 6372 }, { "epoch": 1.41, "learning_rate": 2.1102990620635945e-06, "logits/chosen": -2.1523845195770264, "logits/rejected": -2.1377198696136475, "logps/chosen": -72.33561706542969, "logps/rejected": -36.65916061401367, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 4.167791843414307, "rewards/margins": 4.005788326263428, "rewards/rejected": 0.1620033234357834, "step": 6373 }, { "epoch": 1.41, "learning_rate": 2.1088365662243963e-06, "logits/chosen": -1.9994399547576904, "logits/rejected": -1.9994399547576904, "logps/chosen": -11.568724632263184, "logps/rejected": -11.568724632263184, "loss": 0.3534, "rewards/accuracies": 0.0, "rewards/chosen": 2.635115146636963, "rewards/margins": 0.0, "rewards/rejected": 2.635115146636963, "step": 6374 }, { "epoch": 1.41, "learning_rate": 2.1073744418927493e-06, "logits/chosen": -1.7136821746826172, "logits/rejected": -1.660134196281433, "logps/chosen": -48.03933334350586, "logps/rejected": -46.551734924316406, "loss": 0.1559, "rewards/accuracies": 1.0, "rewards/chosen": 4.219262599945068, "rewards/margins": 1.6243674755096436, "rewards/rejected": 2.594895124435425, "step": 6375 }, { "epoch": 1.41, "learning_rate": 2.105912689256533e-06, "logits/chosen": -1.652669906616211, "logits/rejected": -1.651426076889038, "logps/chosen": -64.75159454345703, "logps/rejected": -88.26008605957031, "loss": 0.8222, "rewards/accuracies": 0.0, "rewards/chosen": 7.585303783416748, "rewards/margins": -1.4285616874694824, "rewards/rejected": 9.01386547088623, "step": 6376 }, { "epoch": 1.41, "learning_rate": 2.1044513085035788e-06, "logits/chosen": -2.087257146835327, "logits/rejected": -2.071993589401245, "logps/chosen": -55.34632110595703, "logps/rejected": -99.32010650634766, "loss": 2.3769, "rewards/accuracies": 0.0, "rewards/chosen": 5.133086681365967, "rewards/margins": -4.5876994132995605, "rewards/rejected": 9.720786094665527, "step": 6377 }, { "epoch": 1.41, "learning_rate": 2.102990299821673e-06, "logits/chosen": -2.256809949874878, "logits/rejected": -2.3156092166900635, "logps/chosen": -147.98672485351562, "logps/rejected": -203.033447265625, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 11.953317642211914, "rewards/margins": 3.0443239212036133, "rewards/rejected": 8.9089937210083, "step": 6378 }, { "epoch": 1.41, "learning_rate": 2.101529663398545e-06, "logits/chosen": -1.8414943218231201, "logits/rejected": -1.8414943218231201, "logps/chosen": -30.076343536376953, "logps/rejected": -30.076343536376953, "loss": 0.3468, "rewards/accuracies": 0.0, "rewards/chosen": 6.239546775817871, "rewards/margins": 0.0, "rewards/rejected": 6.239546775817871, "step": 6379 }, { "epoch": 1.41, "learning_rate": 2.100069399421892e-06, "logits/chosen": -1.9208431243896484, "logits/rejected": -1.8315774202346802, "logps/chosen": -32.070472717285156, "logps/rejected": -46.57982635498047, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 3.3826904296875, "rewards/margins": 2.759772777557373, "rewards/rejected": 0.6229175925254822, "step": 6380 }, { "epoch": 1.41, "learning_rate": 2.0986095080793457e-06, "logits/chosen": -1.9976156949996948, "logits/rejected": -2.0256974697113037, "logps/chosen": -23.45284652709961, "logps/rejected": -139.52847290039062, "loss": 2.7416, "rewards/accuracies": 0.0, "rewards/chosen": 3.468719244003296, "rewards/margins": -5.4028425216674805, "rewards/rejected": 8.871562004089355, "step": 6381 }, { "epoch": 1.41, "learning_rate": 2.0971499895585066e-06, "logits/chosen": -2.1473729610443115, "logits/rejected": -2.114964723587036, "logps/chosen": -66.23576354980469, "logps/rejected": -106.00401306152344, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 6.713550090789795, "rewards/margins": 3.256688117980957, "rewards/rejected": 3.456861972808838, "step": 6382 }, { "epoch": 1.41, "learning_rate": 2.0956908440469137e-06, "logits/chosen": -1.8260984420776367, "logits/rejected": -1.8701545000076294, "logps/chosen": -51.90989303588867, "logps/rejected": -130.7093505859375, "loss": 0.1742, "rewards/accuracies": 1.0, "rewards/chosen": 7.9623308181762695, "rewards/margins": 0.8923602104187012, "rewards/rejected": 7.069970607757568, "step": 6383 }, { "epoch": 1.41, "learning_rate": 2.0942320717320658e-06, "logits/chosen": -1.7487280368804932, "logits/rejected": -1.7487280368804932, "logps/chosen": -7.873957633972168, "logps/rejected": -7.873957633972168, "loss": 0.3547, "rewards/accuracies": 0.0, "rewards/chosen": 1.3085296154022217, "rewards/margins": 0.0, "rewards/rejected": 1.3085296154022217, "step": 6384 }, { "epoch": 1.41, "learning_rate": 2.0927736728014112e-06, "logits/chosen": -1.9208705425262451, "logits/rejected": -1.9819884300231934, "logps/chosen": -38.92604064941406, "logps/rejected": -90.93975830078125, "loss": 1.3927, "rewards/accuracies": 0.0, "rewards/chosen": 3.1600327491760254, "rewards/margins": -2.701418876647949, "rewards/rejected": 5.861451625823975, "step": 6385 }, { "epoch": 1.41, "learning_rate": 2.0913156474423514e-06, "logits/chosen": -1.62204909324646, "logits/rejected": -1.62204909324646, "logps/chosen": -4.4598822593688965, "logps/rejected": -4.4598822593688965, "loss": 0.4294, "rewards/accuracies": 0.0, "rewards/chosen": 0.815748393535614, "rewards/margins": 0.0, "rewards/rejected": 0.815748393535614, "step": 6386 }, { "epoch": 1.41, "learning_rate": 2.08985799584224e-06, "logits/chosen": -2.1033473014831543, "logits/rejected": -2.078767776489258, "logps/chosen": -32.314029693603516, "logps/rejected": -70.7337875366211, "loss": 0.8464, "rewards/accuracies": 0.0, "rewards/chosen": 4.045711994171143, "rewards/margins": -1.462179183959961, "rewards/rejected": 5.5078911781311035, "step": 6387 }, { "epoch": 1.41, "learning_rate": 2.0884007181883764e-06, "logits/chosen": -1.830613374710083, "logits/rejected": -1.7783746719360352, "logps/chosen": -42.198421478271484, "logps/rejected": -39.82113265991211, "loss": 0.4384, "rewards/accuracies": 1.0, "rewards/chosen": 4.240006923675537, "rewards/margins": 0.25051069259643555, "rewards/rejected": 3.9894962310791016, "step": 6388 }, { "epoch": 1.41, "learning_rate": 2.0869438146680254e-06, "logits/chosen": -1.8903266191482544, "logits/rejected": -1.8714179992675781, "logps/chosen": -32.52348709106445, "logps/rejected": -43.115142822265625, "loss": 0.7868, "rewards/accuracies": 0.0, "rewards/chosen": 4.210529804229736, "rewards/margins": -1.3197660446166992, "rewards/rejected": 5.5302958488464355, "step": 6389 }, { "epoch": 1.41, "learning_rate": 2.0854872854683877e-06, "logits/chosen": -2.0592379570007324, "logits/rejected": -2.014251470565796, "logps/chosen": -82.87239074707031, "logps/rejected": -50.856117248535156, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 8.553088188171387, "rewards/margins": 2.6685657501220703, "rewards/rejected": 5.884522438049316, "step": 6390 }, { "epoch": 1.41, "learning_rate": 2.084031130776631e-06, "logits/chosen": -2.0728793144226074, "logits/rejected": -2.0992953777313232, "logps/chosen": -51.90248107910156, "logps/rejected": -94.0915756225586, "loss": 0.6214, "rewards/accuracies": 0.0, "rewards/chosen": 3.354384660720825, "rewards/margins": -0.32135772705078125, "rewards/rejected": 3.6757423877716064, "step": 6391 }, { "epoch": 1.41, "learning_rate": 2.0825753507798623e-06, "logits/chosen": -2.049908399581909, "logits/rejected": -1.4130587577819824, "logps/chosen": -56.56305694580078, "logps/rejected": -41.13060760498047, "loss": 0.0788, "rewards/accuracies": 1.0, "rewards/chosen": 7.390960216522217, "rewards/margins": 1.9460563659667969, "rewards/rejected": 5.44490385055542, "step": 6392 }, { "epoch": 1.42, "learning_rate": 2.0811199456651484e-06, "logits/chosen": -1.681968092918396, "logits/rejected": -1.6186269521713257, "logps/chosen": -89.38286590576172, "logps/rejected": -11.690773010253906, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 3.4849510192871094, "rewards/margins": 2.4979355335235596, "rewards/rejected": 0.9870155453681946, "step": 6393 }, { "epoch": 1.42, "learning_rate": 2.079664915619504e-06, "logits/chosen": -1.8528343439102173, "logits/rejected": -1.837506890296936, "logps/chosen": -53.439910888671875, "logps/rejected": -70.0186996459961, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 4.962192058563232, "rewards/margins": 1.890235424041748, "rewards/rejected": 3.0719566345214844, "step": 6394 }, { "epoch": 1.42, "learning_rate": 2.0782102608298977e-06, "logits/chosen": -1.66051185131073, "logits/rejected": -1.5615499019622803, "logps/chosen": -52.785865783691406, "logps/rejected": -20.045368194580078, "loss": 0.3992, "rewards/accuracies": 1.0, "rewards/chosen": 2.630208730697632, "rewards/margins": 2.1992805004119873, "rewards/rejected": 0.43092823028564453, "step": 6395 }, { "epoch": 1.42, "learning_rate": 2.076755981483249e-06, "logits/chosen": -1.7963100671768188, "logits/rejected": -1.748081088066101, "logps/chosen": -60.86228942871094, "logps/rejected": -24.908218383789062, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 3.0654473304748535, "rewards/margins": 1.8444634675979614, "rewards/rejected": 1.220983862876892, "step": 6396 }, { "epoch": 1.42, "learning_rate": 2.0753020777664294e-06, "logits/chosen": -2.129429817199707, "logits/rejected": -2.129429817199707, "logps/chosen": -89.6150894165039, "logps/rejected": -89.6150894165039, "loss": 0.3592, "rewards/accuracies": 0.0, "rewards/chosen": 7.083489418029785, "rewards/margins": 0.0, "rewards/rejected": 7.083489418029785, "step": 6397 }, { "epoch": 1.42, "learning_rate": 2.073848549866264e-06, "logits/chosen": -2.132957935333252, "logits/rejected": -2.140061855316162, "logps/chosen": -31.739730834960938, "logps/rejected": -60.048583984375, "loss": 0.2182, "rewards/accuracies": 1.0, "rewards/chosen": 3.059117555618286, "rewards/margins": 1.9991871118545532, "rewards/rejected": 1.059930443763733, "step": 6398 }, { "epoch": 1.42, "learning_rate": 2.0723953979695224e-06, "logits/chosen": -1.8632761240005493, "logits/rejected": -1.7247967720031738, "logps/chosen": -34.98719024658203, "logps/rejected": -46.81938934326172, "loss": 0.4395, "rewards/accuracies": 1.0, "rewards/chosen": 2.685157537460327, "rewards/margins": 0.44106483459472656, "rewards/rejected": 2.2440927028656006, "step": 6399 }, { "epoch": 1.42, "learning_rate": 2.0709426222629374e-06, "logits/chosen": -2.0930051803588867, "logits/rejected": -1.939102053642273, "logps/chosen": -86.26689147949219, "logps/rejected": -8.656651496887207, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 4.346208095550537, "rewards/margins": 3.1582512855529785, "rewards/rejected": 1.1879569292068481, "step": 6400 }, { "epoch": 1.42, "learning_rate": 2.069490222933183e-06, "logits/chosen": -2.1266186237335205, "logits/rejected": -2.1228086948394775, "logps/chosen": -68.65367889404297, "logps/rejected": -70.01480102539062, "loss": 0.2838, "rewards/accuracies": 1.0, "rewards/chosen": 3.915512800216675, "rewards/margins": 0.5078659057617188, "rewards/rejected": 3.407646894454956, "step": 6401 }, { "epoch": 1.42, "learning_rate": 2.0680382001668896e-06, "logits/chosen": -1.860392689704895, "logits/rejected": -1.7868077754974365, "logps/chosen": -103.03695678710938, "logps/rejected": -50.377052307128906, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 6.005807399749756, "rewards/margins": 2.290886640548706, "rewards/rejected": 3.71492075920105, "step": 6402 }, { "epoch": 1.42, "learning_rate": 2.0665865541506397e-06, "logits/chosen": -1.6555365324020386, "logits/rejected": -1.5616145133972168, "logps/chosen": -65.07362365722656, "logps/rejected": -76.25348663330078, "loss": 0.432, "rewards/accuracies": 1.0, "rewards/chosen": 3.6580352783203125, "rewards/margins": 3.176036834716797, "rewards/rejected": 0.4819984436035156, "step": 6403 }, { "epoch": 1.42, "learning_rate": 2.0651352850709656e-06, "logits/chosen": -2.0565311908721924, "logits/rejected": -1.9910165071487427, "logps/chosen": -104.55205535888672, "logps/rejected": -56.65579605102539, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 5.952335357666016, "rewards/margins": 2.985018491744995, "rewards/rejected": 2.9673168659210205, "step": 6404 }, { "epoch": 1.42, "learning_rate": 2.0636843931143523e-06, "logits/chosen": -2.1273887157440186, "logits/rejected": -2.1437788009643555, "logps/chosen": -31.34059715270996, "logps/rejected": -115.57697296142578, "loss": 0.2703, "rewards/accuracies": 1.0, "rewards/chosen": 3.3753936290740967, "rewards/margins": 0.9521887302398682, "rewards/rejected": 2.4232048988342285, "step": 6405 }, { "epoch": 1.42, "learning_rate": 2.062233878467236e-06, "logits/chosen": -1.8798705339431763, "logits/rejected": -1.8863904476165771, "logps/chosen": -30.186010360717773, "logps/rejected": -60.429359436035156, "loss": 0.2779, "rewards/accuracies": 1.0, "rewards/chosen": 3.852410078048706, "rewards/margins": 0.784111738204956, "rewards/rejected": 3.06829833984375, "step": 6406 }, { "epoch": 1.42, "learning_rate": 2.060783741316005e-06, "logits/chosen": -1.963034749031067, "logits/rejected": -1.967970371246338, "logps/chosen": -73.1241226196289, "logps/rejected": -51.018341064453125, "loss": 0.3553, "rewards/accuracies": 1.0, "rewards/chosen": 4.197606086730957, "rewards/margins": 0.5603432655334473, "rewards/rejected": 3.6372628211975098, "step": 6407 }, { "epoch": 1.42, "learning_rate": 2.0593339818469947e-06, "logits/chosen": -1.9329800605773926, "logits/rejected": -1.958682894706726, "logps/chosen": -53.06422805786133, "logps/rejected": -84.17774200439453, "loss": 0.3461, "rewards/accuracies": 1.0, "rewards/chosen": 8.158212661743164, "rewards/margins": 0.020826339721679688, "rewards/rejected": 8.137386322021484, "step": 6408 }, { "epoch": 1.42, "learning_rate": 2.057884600246502e-06, "logits/chosen": -1.6481280326843262, "logits/rejected": -1.6447391510009766, "logps/chosen": -6.079286575317383, "logps/rejected": -24.94458770751953, "loss": 0.2544, "rewards/accuracies": 1.0, "rewards/chosen": 2.183764696121216, "rewards/margins": 0.4338715076446533, "rewards/rejected": 1.7498931884765625, "step": 6409 }, { "epoch": 1.42, "learning_rate": 2.0564355967007638e-06, "logits/chosen": -1.886892557144165, "logits/rejected": -1.8302394151687622, "logps/chosen": -52.2497673034668, "logps/rejected": -54.64961242675781, "loss": 0.4098, "rewards/accuracies": 0.0, "rewards/chosen": 3.6454739570617676, "rewards/margins": -0.04400897026062012, "rewards/rejected": 3.6894829273223877, "step": 6410 }, { "epoch": 1.42, "learning_rate": 2.0549869713959753e-06, "logits/chosen": -2.1829657554626465, "logits/rejected": -2.1871376037597656, "logps/chosen": -59.542449951171875, "logps/rejected": -55.43510437011719, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 5.636817455291748, "rewards/margins": 2.6341350078582764, "rewards/rejected": 3.0026824474334717, "step": 6411 }, { "epoch": 1.42, "learning_rate": 2.0535387245182814e-06, "logits/chosen": -1.8005032539367676, "logits/rejected": -1.89425528049469, "logps/chosen": -66.1398696899414, "logps/rejected": -120.21772766113281, "loss": 1.6125, "rewards/accuracies": 0.0, "rewards/chosen": 7.012538909912109, "rewards/margins": -3.162923812866211, "rewards/rejected": 10.17546272277832, "step": 6412 }, { "epoch": 1.42, "learning_rate": 2.0520908562537784e-06, "logits/chosen": -1.8454499244689941, "logits/rejected": -1.7793563604354858, "logps/chosen": -140.104736328125, "logps/rejected": -72.09455871582031, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 6.302862644195557, "rewards/margins": 2.6129074096679688, "rewards/rejected": 3.689955234527588, "step": 6413 }, { "epoch": 1.42, "learning_rate": 2.0506433667885134e-06, "logits/chosen": -1.759399175643921, "logits/rejected": -1.759399175643921, "logps/chosen": -14.790645599365234, "logps/rejected": -14.790645599365234, "loss": 0.3467, "rewards/accuracies": 0.0, "rewards/chosen": 2.3823201656341553, "rewards/margins": 0.0, "rewards/rejected": 2.3823201656341553, "step": 6414 }, { "epoch": 1.42, "learning_rate": 2.049196256308486e-06, "logits/chosen": -1.9622999429702759, "logits/rejected": -1.9038267135620117, "logps/chosen": -105.50289154052734, "logps/rejected": -67.67879486083984, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 7.704004764556885, "rewards/margins": 3.866481065750122, "rewards/rejected": 3.8375236988067627, "step": 6415 }, { "epoch": 1.42, "learning_rate": 2.047749524999646e-06, "logits/chosen": -2.1261789798736572, "logits/rejected": -2.0530920028686523, "logps/chosen": -80.95037078857422, "logps/rejected": -11.94113540649414, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 4.790940284729004, "rewards/margins": 3.5447185039520264, "rewards/rejected": 1.2462217807769775, "step": 6416 }, { "epoch": 1.42, "learning_rate": 2.0463031730478965e-06, "logits/chosen": -1.6712342500686646, "logits/rejected": -1.6486165523529053, "logps/chosen": -54.59626007080078, "logps/rejected": -94.48190307617188, "loss": 0.1774, "rewards/accuracies": 1.0, "rewards/chosen": 5.11455774307251, "rewards/margins": 1.3684577941894531, "rewards/rejected": 3.7460999488830566, "step": 6417 }, { "epoch": 1.42, "learning_rate": 2.0448572006390875e-06, "logits/chosen": -1.8373059034347534, "logits/rejected": -1.8151198625564575, "logps/chosen": -83.19320678710938, "logps/rejected": -62.925636291503906, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": 6.140615940093994, "rewards/margins": 2.9273438453674316, "rewards/rejected": 3.2132720947265625, "step": 6418 }, { "epoch": 1.42, "learning_rate": 2.0434116079590243e-06, "logits/chosen": -2.166731119155884, "logits/rejected": -2.0842835903167725, "logps/chosen": -92.39028930664062, "logps/rejected": -199.52789306640625, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 10.572073936462402, "rewards/margins": 2.6863131523132324, "rewards/rejected": 7.88576078414917, "step": 6419 }, { "epoch": 1.42, "learning_rate": 2.041966395193462e-06, "logits/chosen": -1.8190898895263672, "logits/rejected": -1.8602879047393799, "logps/chosen": -20.794689178466797, "logps/rejected": -62.77302169799805, "loss": 0.7802, "rewards/accuracies": 0.0, "rewards/chosen": 3.034839391708374, "rewards/margins": -0.9335188865661621, "rewards/rejected": 3.968358278274536, "step": 6420 }, { "epoch": 1.42, "learning_rate": 2.0405215625281067e-06, "logits/chosen": -1.8779270648956299, "logits/rejected": -1.894822359085083, "logps/chosen": -43.78841018676758, "logps/rejected": -73.81295013427734, "loss": 0.2777, "rewards/accuracies": 1.0, "rewards/chosen": 3.7534711360931396, "rewards/margins": 0.6813099384307861, "rewards/rejected": 3.0721611976623535, "step": 6421 }, { "epoch": 1.42, "learning_rate": 2.039077110148616e-06, "logits/chosen": -2.2126970291137695, "logits/rejected": -2.2026662826538086, "logps/chosen": -36.377235412597656, "logps/rejected": -73.92842102050781, "loss": 0.5989, "rewards/accuracies": 1.0, "rewards/chosen": 3.7208945751190186, "rewards/margins": 0.7167074680328369, "rewards/rejected": 3.0041871070861816, "step": 6422 }, { "epoch": 1.42, "learning_rate": 2.0376330382405997e-06, "logits/chosen": -2.1312148571014404, "logits/rejected": -2.099112033843994, "logps/chosen": -82.27949523925781, "logps/rejected": -134.77108764648438, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": 8.531349182128906, "rewards/margins": 2.4680066108703613, "rewards/rejected": 6.063342571258545, "step": 6423 }, { "epoch": 1.42, "learning_rate": 2.036189346989616e-06, "logits/chosen": -1.8399136066436768, "logits/rejected": -1.8399136066436768, "logps/chosen": -47.47789764404297, "logps/rejected": -47.47789764404297, "loss": 0.4, "rewards/accuracies": 0.0, "rewards/chosen": 6.539322853088379, "rewards/margins": 0.0, "rewards/rejected": 6.539322853088379, "step": 6424 }, { "epoch": 1.42, "learning_rate": 2.034746036581176e-06, "logits/chosen": -1.8764011859893799, "logits/rejected": -1.4652765989303589, "logps/chosen": -91.48764038085938, "logps/rejected": -99.02864074707031, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": 6.532820224761963, "rewards/margins": 1.7219209671020508, "rewards/rejected": 4.810899257659912, "step": 6425 }, { "epoch": 1.42, "learning_rate": 2.0333031072007447e-06, "logits/chosen": -1.9099270105361938, "logits/rejected": -1.9010788202285767, "logps/chosen": -31.605998992919922, "logps/rejected": -44.968971252441406, "loss": 0.2835, "rewards/accuracies": 1.0, "rewards/chosen": 3.2910542488098145, "rewards/margins": 0.28244829177856445, "rewards/rejected": 3.00860595703125, "step": 6426 }, { "epoch": 1.42, "learning_rate": 2.031860559033728e-06, "logits/chosen": -1.6450361013412476, "logits/rejected": -1.784020185470581, "logps/chosen": -62.244529724121094, "logps/rejected": -99.12921142578125, "loss": 1.3269, "rewards/accuracies": 0.0, "rewards/chosen": 5.923511028289795, "rewards/margins": -2.425245761871338, "rewards/rejected": 8.348756790161133, "step": 6427 }, { "epoch": 1.42, "learning_rate": 2.0304183922654985e-06, "logits/chosen": -2.0051865577697754, "logits/rejected": -1.9944194555282593, "logps/chosen": -47.07855987548828, "logps/rejected": -86.33528137207031, "loss": 0.1864, "rewards/accuracies": 1.0, "rewards/chosen": 4.12063455581665, "rewards/margins": 0.8096001148223877, "rewards/rejected": 3.3110344409942627, "step": 6428 }, { "epoch": 1.42, "learning_rate": 2.028976607081365e-06, "logits/chosen": -2.008219003677368, "logits/rejected": -2.0759224891662598, "logps/chosen": -128.67591857910156, "logps/rejected": -84.77075958251953, "loss": 1.5692, "rewards/accuracies": 0.0, "rewards/chosen": 4.650399684906006, "rewards/margins": -3.0127434730529785, "rewards/rejected": 7.663143157958984, "step": 6429 }, { "epoch": 1.42, "learning_rate": 2.027535203666595e-06, "logits/chosen": -2.1547927856445312, "logits/rejected": -2.1093289852142334, "logps/chosen": -76.65151977539062, "logps/rejected": -59.88957977294922, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 8.167914390563965, "rewards/margins": 4.148252010345459, "rewards/rejected": 4.019662380218506, "step": 6430 }, { "epoch": 1.42, "learning_rate": 2.026094182206406e-06, "logits/chosen": -1.381574273109436, "logits/rejected": -1.381574273109436, "logps/chosen": -11.250904083251953, "logps/rejected": -11.250904083251953, "loss": 0.517, "rewards/accuracies": 0.0, "rewards/chosen": 0.4168451428413391, "rewards/margins": 0.0, "rewards/rejected": 0.4168451428413391, "step": 6431 }, { "epoch": 1.42, "learning_rate": 2.0246535428859652e-06, "logits/chosen": -2.081174373626709, "logits/rejected": -2.1283352375030518, "logps/chosen": -109.39418029785156, "logps/rejected": -188.73751831054688, "loss": 0.1003, "rewards/accuracies": 1.0, "rewards/chosen": 10.717801094055176, "rewards/margins": 1.50958251953125, "rewards/rejected": 9.208218574523926, "step": 6432 }, { "epoch": 1.42, "learning_rate": 2.0232132858903923e-06, "logits/chosen": -2.4007411003112793, "logits/rejected": -2.395341634750366, "logps/chosen": -38.77272033691406, "logps/rejected": -12.100622177124023, "loss": 0.8278, "rewards/accuracies": 0.0, "rewards/chosen": 2.024836301803589, "rewards/margins": -0.9112720489501953, "rewards/rejected": 2.936108350753784, "step": 6433 }, { "epoch": 1.42, "learning_rate": 2.021773411404756e-06, "logits/chosen": -1.8503731489181519, "logits/rejected": -1.8537769317626953, "logps/chosen": -9.772811889648438, "logps/rejected": -9.195780754089355, "loss": 0.4652, "rewards/accuracies": 0.0, "rewards/chosen": 1.5207958221435547, "rewards/margins": -0.1702415943145752, "rewards/rejected": 1.6910374164581299, "step": 6434 }, { "epoch": 1.42, "learning_rate": 2.020333919614078e-06, "logits/chosen": -1.976313591003418, "logits/rejected": -1.9679690599441528, "logps/chosen": -38.27000427246094, "logps/rejected": -66.65233612060547, "loss": 1.0932, "rewards/accuracies": 0.0, "rewards/chosen": 2.7998154163360596, "rewards/margins": -1.897972822189331, "rewards/rejected": 4.697788238525391, "step": 6435 }, { "epoch": 1.42, "learning_rate": 2.018894810703325e-06, "logits/chosen": -1.8075075149536133, "logits/rejected": -1.8462847471237183, "logps/chosen": -16.77273941040039, "logps/rejected": -36.00536346435547, "loss": 0.765, "rewards/accuracies": 0.0, "rewards/chosen": 2.4225685596466064, "rewards/margins": -1.2650847434997559, "rewards/rejected": 3.6876533031463623, "step": 6436 }, { "epoch": 1.42, "learning_rate": 2.0174560848574266e-06, "logits/chosen": -2.055016040802002, "logits/rejected": -2.0108416080474854, "logps/chosen": -159.31532287597656, "logps/rejected": -70.57081604003906, "loss": 0.0802, "rewards/accuracies": 1.0, "rewards/chosen": 7.7824907302856445, "rewards/margins": 1.9313616752624512, "rewards/rejected": 5.851129055023193, "step": 6437 }, { "epoch": 1.42, "learning_rate": 2.016017742261249e-06, "logits/chosen": -1.9092592000961304, "logits/rejected": -1.878002405166626, "logps/chosen": -128.06982421875, "logps/rejected": -64.7382583618164, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 9.333772659301758, "rewards/margins": 3.90963077545166, "rewards/rejected": 5.424141883850098, "step": 6438 }, { "epoch": 1.43, "learning_rate": 2.014579783099619e-06, "logits/chosen": -1.9081271886825562, "logits/rejected": -1.8900378942489624, "logps/chosen": -84.38169860839844, "logps/rejected": -65.58977508544922, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 9.100723266601562, "rewards/margins": 1.843498706817627, "rewards/rejected": 7.2572245597839355, "step": 6439 }, { "epoch": 1.43, "learning_rate": 2.0131422075573097e-06, "logits/chosen": -1.5847822427749634, "logits/rejected": -1.528926968574524, "logps/chosen": -108.15657043457031, "logps/rejected": -31.388229370117188, "loss": 0.1708, "rewards/accuracies": 1.0, "rewards/chosen": 5.681910991668701, "rewards/margins": 2.856459617614746, "rewards/rejected": 2.825451374053955, "step": 6440 }, { "epoch": 1.43, "learning_rate": 2.0117050158190464e-06, "logits/chosen": -1.6897908449172974, "logits/rejected": -1.711430311203003, "logps/chosen": -25.644176483154297, "logps/rejected": -62.001869201660156, "loss": 0.395, "rewards/accuracies": 0.0, "rewards/chosen": 5.217606067657471, "rewards/margins": -0.1504812240600586, "rewards/rejected": 5.368087291717529, "step": 6441 }, { "epoch": 1.43, "learning_rate": 2.0102682080695053e-06, "logits/chosen": -1.9531176090240479, "logits/rejected": -1.9586936235427856, "logps/chosen": -42.620216369628906, "logps/rejected": -76.48780822753906, "loss": 0.9069, "rewards/accuracies": 0.0, "rewards/chosen": 6.424249172210693, "rewards/margins": -1.5224061012268066, "rewards/rejected": 7.9466552734375, "step": 6442 }, { "epoch": 1.43, "learning_rate": 2.008831784493312e-06, "logits/chosen": -1.907421588897705, "logits/rejected": -1.869674563407898, "logps/chosen": -33.999847412109375, "logps/rejected": -64.9092788696289, "loss": 0.5516, "rewards/accuracies": 0.0, "rewards/chosen": 3.5422394275665283, "rewards/margins": -0.6977694034576416, "rewards/rejected": 4.24000883102417, "step": 6443 }, { "epoch": 1.43, "learning_rate": 2.0073957452750453e-06, "logits/chosen": -1.6817243099212646, "logits/rejected": -1.6817243099212646, "logps/chosen": -48.378726959228516, "logps/rejected": -48.378726959228516, "loss": 0.3555, "rewards/accuracies": 0.0, "rewards/chosen": 1.9979534149169922, "rewards/margins": 0.0, "rewards/rejected": 1.9979534149169922, "step": 6444 }, { "epoch": 1.43, "learning_rate": 2.005960090599228e-06, "logits/chosen": -1.6862432956695557, "logits/rejected": -1.615555763244629, "logps/chosen": -33.237125396728516, "logps/rejected": -20.24135398864746, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 3.5329463481903076, "rewards/margins": 1.9881230592727661, "rewards/rejected": 1.5448232889175415, "step": 6445 }, { "epoch": 1.43, "learning_rate": 2.0045248206503454e-06, "logits/chosen": -1.805822491645813, "logits/rejected": -1.7165619134902954, "logps/chosen": -47.02153015136719, "logps/rejected": -12.043176651000977, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 2.6095688343048096, "rewards/margins": 1.5937374830245972, "rewards/rejected": 1.0158313512802124, "step": 6446 }, { "epoch": 1.43, "learning_rate": 2.0030899356128185e-06, "logits/chosen": -1.7380256652832031, "logits/rejected": -1.773869276046753, "logps/chosen": -24.29090690612793, "logps/rejected": -89.24668884277344, "loss": 0.2386, "rewards/accuracies": 1.0, "rewards/chosen": 3.1637396812438965, "rewards/margins": 0.5080344676971436, "rewards/rejected": 2.655705213546753, "step": 6447 }, { "epoch": 1.43, "learning_rate": 2.0016554356710344e-06, "logits/chosen": -1.8850908279418945, "logits/rejected": -1.9087562561035156, "logps/chosen": -34.92645263671875, "logps/rejected": -43.86756134033203, "loss": 0.3062, "rewards/accuracies": 1.0, "rewards/chosen": 4.024142742156982, "rewards/margins": 0.18823790550231934, "rewards/rejected": 3.835904836654663, "step": 6448 }, { "epoch": 1.43, "learning_rate": 2.0002213210093174e-06, "logits/chosen": -2.167585849761963, "logits/rejected": -2.188800811767578, "logps/chosen": -41.080421447753906, "logps/rejected": -59.054718017578125, "loss": 0.2883, "rewards/accuracies": 1.0, "rewards/chosen": 4.184960842132568, "rewards/margins": 0.2781660556793213, "rewards/rejected": 3.906794786453247, "step": 6449 }, { "epoch": 1.43, "learning_rate": 1.9987875918119486e-06, "logits/chosen": -2.032691240310669, "logits/rejected": -2.046985387802124, "logps/chosen": -68.3193588256836, "logps/rejected": -83.41331481933594, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 7.191948890686035, "rewards/margins": 0.9160013198852539, "rewards/rejected": 6.275947570800781, "step": 6450 }, { "epoch": 1.43, "learning_rate": 1.99735424826316e-06, "logits/chosen": -1.748146414756775, "logits/rejected": -1.7404422760009766, "logps/chosen": -40.708251953125, "logps/rejected": -38.050025939941406, "loss": 0.6664, "rewards/accuracies": 0.0, "rewards/chosen": 3.0927841663360596, "rewards/margins": -0.06998372077941895, "rewards/rejected": 3.1627678871154785, "step": 6451 }, { "epoch": 1.43, "learning_rate": 1.9959212905471326e-06, "logits/chosen": -2.2128052711486816, "logits/rejected": -2.17181658744812, "logps/chosen": -67.71681213378906, "logps/rejected": -27.05474281311035, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 3.7726457118988037, "rewards/margins": 1.7970645427703857, "rewards/rejected": 1.975581169128418, "step": 6452 }, { "epoch": 1.43, "learning_rate": 1.9944887188479995e-06, "logits/chosen": -1.9889791011810303, "logits/rejected": -1.983378291130066, "logps/chosen": -58.26172637939453, "logps/rejected": -96.64486694335938, "loss": 0.2028, "rewards/accuracies": 1.0, "rewards/chosen": 7.519994258880615, "rewards/margins": 0.8310627937316895, "rewards/rejected": 6.688931465148926, "step": 6453 }, { "epoch": 1.43, "learning_rate": 1.993056533349837e-06, "logits/chosen": -2.0804390907287598, "logits/rejected": -2.0148725509643555, "logps/chosen": -54.87778091430664, "logps/rejected": -57.56580352783203, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 5.379967212677002, "rewards/margins": 2.5367207527160645, "rewards/rejected": 2.8432464599609375, "step": 6454 }, { "epoch": 1.43, "learning_rate": 1.991624734236685e-06, "logits/chosen": -1.7873048782348633, "logits/rejected": -1.7198452949523926, "logps/chosen": -78.03480529785156, "logps/rejected": -70.62065887451172, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 7.260158061981201, "rewards/margins": 2.2423272132873535, "rewards/rejected": 5.017830848693848, "step": 6455 }, { "epoch": 1.43, "learning_rate": 1.9901933216925196e-06, "logits/chosen": -1.7553727626800537, "logits/rejected": -1.7793000936508179, "logps/chosen": -63.124412536621094, "logps/rejected": -96.50829315185547, "loss": 0.4487, "rewards/accuracies": 0.0, "rewards/chosen": 5.43835973739624, "rewards/margins": -0.21148252487182617, "rewards/rejected": 5.649842262268066, "step": 6456 }, { "epoch": 1.43, "learning_rate": 1.9887622959012804e-06, "logits/chosen": -2.0433125495910645, "logits/rejected": -1.9706575870513916, "logps/chosen": -36.50722885131836, "logps/rejected": -18.6268310546875, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": 3.127014636993408, "rewards/margins": 1.5235973596572876, "rewards/rejected": 1.6034172773361206, "step": 6457 }, { "epoch": 1.43, "learning_rate": 1.9873316570468456e-06, "logits/chosen": -2.1063997745513916, "logits/rejected": -2.0690383911132812, "logps/chosen": -32.81248474121094, "logps/rejected": -20.636947631835938, "loss": 0.3091, "rewards/accuracies": 1.0, "rewards/chosen": 3.2455475330352783, "rewards/margins": 1.007969617843628, "rewards/rejected": 2.2375779151916504, "step": 6458 }, { "epoch": 1.43, "learning_rate": 1.9859014053130505e-06, "logits/chosen": -2.0703322887420654, "logits/rejected": -2.046082019805908, "logps/chosen": -90.95278930664062, "logps/rejected": -80.06501770019531, "loss": 0.2025, "rewards/accuracies": 1.0, "rewards/chosen": 5.9920501708984375, "rewards/margins": 4.538186073303223, "rewards/rejected": 1.453864336013794, "step": 6459 }, { "epoch": 1.43, "learning_rate": 1.984471540883679e-06, "logits/chosen": -1.9596807956695557, "logits/rejected": -1.9239792823791504, "logps/chosen": -82.79826354980469, "logps/rejected": -116.000244140625, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": 6.354544162750244, "rewards/margins": 1.4921965599060059, "rewards/rejected": 4.862347602844238, "step": 6460 }, { "epoch": 1.43, "learning_rate": 1.9830420639424653e-06, "logits/chosen": -1.7995951175689697, "logits/rejected": -1.7978124618530273, "logps/chosen": -37.1277961730957, "logps/rejected": -57.30024719238281, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 4.481657028198242, "rewards/margins": 0.37865638732910156, "rewards/rejected": 4.103000640869141, "step": 6461 }, { "epoch": 1.43, "learning_rate": 1.981612974673096e-06, "logits/chosen": -1.8790695667266846, "logits/rejected": -1.897255539894104, "logps/chosen": -78.67253112792969, "logps/rejected": -111.19285583496094, "loss": 0.1249, "rewards/accuracies": 1.0, "rewards/chosen": 7.5100603103637695, "rewards/margins": 1.3503146171569824, "rewards/rejected": 6.159745693206787, "step": 6462 }, { "epoch": 1.43, "learning_rate": 1.9801842732592e-06, "logits/chosen": -1.6944139003753662, "logits/rejected": -1.7046490907669067, "logps/chosen": -40.553611755371094, "logps/rejected": -89.64762878417969, "loss": 0.7916, "rewards/accuracies": 0.0, "rewards/chosen": 4.922402381896973, "rewards/margins": -1.3467869758605957, "rewards/rejected": 6.269189357757568, "step": 6463 }, { "epoch": 1.43, "learning_rate": 1.9787559598843685e-06, "logits/chosen": -1.734418272972107, "logits/rejected": -1.734418272972107, "logps/chosen": -10.50190258026123, "logps/rejected": -10.50190258026123, "loss": 0.5607, "rewards/accuracies": 0.0, "rewards/chosen": 3.0450100898742676, "rewards/margins": 0.0, "rewards/rejected": 3.0450100898742676, "step": 6464 }, { "epoch": 1.43, "learning_rate": 1.97732803473213e-06, "logits/chosen": -1.9818625450134277, "logits/rejected": -1.9986358880996704, "logps/chosen": -36.38842010498047, "logps/rejected": -93.46102905273438, "loss": 2.7518, "rewards/accuracies": 0.0, "rewards/chosen": 5.26070499420166, "rewards/margins": -5.011282920837402, "rewards/rejected": 10.271987915039062, "step": 6465 }, { "epoch": 1.43, "learning_rate": 1.9759004979859764e-06, "logits/chosen": -1.7568079233169556, "logits/rejected": -1.7534840106964111, "logps/chosen": -102.94279479980469, "logps/rejected": -64.52752685546875, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": 6.141975402832031, "rewards/margins": 2.4769134521484375, "rewards/rejected": 3.6650619506835938, "step": 6466 }, { "epoch": 1.43, "learning_rate": 1.974473349829337e-06, "logits/chosen": -1.9911086559295654, "logits/rejected": -1.8936179876327515, "logps/chosen": -85.20158386230469, "logps/rejected": -15.489761352539062, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 5.380427837371826, "rewards/margins": 4.937963962554932, "rewards/rejected": 0.44246408343315125, "step": 6467 }, { "epoch": 1.43, "learning_rate": 1.9730465904455993e-06, "logits/chosen": -2.0145204067230225, "logits/rejected": -1.9711902141571045, "logps/chosen": -138.78565979003906, "logps/rejected": -72.55935668945312, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 6.721684455871582, "rewards/margins": 2.8687937259674072, "rewards/rejected": 3.852890729904175, "step": 6468 }, { "epoch": 1.43, "learning_rate": 1.9716202200180994e-06, "logits/chosen": -1.7835232019424438, "logits/rejected": -1.6467257738113403, "logps/chosen": -56.64655303955078, "logps/rejected": -60.78205871582031, "loss": 0.128, "rewards/accuracies": 1.0, "rewards/chosen": 5.111390113830566, "rewards/margins": 1.2514917850494385, "rewards/rejected": 3.859898328781128, "step": 6469 }, { "epoch": 1.43, "learning_rate": 1.970194238730118e-06, "logits/chosen": -1.8749624490737915, "logits/rejected": -1.7441655397415161, "logps/chosen": -121.98662567138672, "logps/rejected": -24.58993148803711, "loss": 0.0871, "rewards/accuracies": 1.0, "rewards/chosen": 6.817115306854248, "rewards/margins": 4.267621994018555, "rewards/rejected": 2.5494930744171143, "step": 6470 }, { "epoch": 1.43, "learning_rate": 1.968768646764897e-06, "logits/chosen": -1.842704176902771, "logits/rejected": -1.805312156677246, "logps/chosen": -150.35459899902344, "logps/rejected": -144.61111450195312, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 7.432774543762207, "rewards/margins": 3.1790618896484375, "rewards/rejected": 4.2537126541137695, "step": 6471 }, { "epoch": 1.43, "learning_rate": 1.9673434443056144e-06, "logits/chosen": -1.9186017513275146, "logits/rejected": -1.9063957929611206, "logps/chosen": -52.487762451171875, "logps/rejected": -71.6585693359375, "loss": 0.1906, "rewards/accuracies": 1.0, "rewards/chosen": 3.3247604370117188, "rewards/margins": 0.9633314609527588, "rewards/rejected": 2.36142897605896, "step": 6472 }, { "epoch": 1.43, "learning_rate": 1.965918631535412e-06, "logits/chosen": -1.7412683963775635, "logits/rejected": -1.738222599029541, "logps/chosen": -53.89143371582031, "logps/rejected": -58.28776168823242, "loss": 1.3881, "rewards/accuracies": 0.0, "rewards/chosen": 3.865147352218628, "rewards/margins": -1.8350484371185303, "rewards/rejected": 5.700195789337158, "step": 6473 }, { "epoch": 1.43, "learning_rate": 1.964494208637369e-06, "logits/chosen": -1.915495753288269, "logits/rejected": -1.7725193500518799, "logps/chosen": -99.47949981689453, "logps/rejected": -22.072839736938477, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": 8.318105697631836, "rewards/margins": 7.053290367126465, "rewards/rejected": 1.264815330505371, "step": 6474 }, { "epoch": 1.43, "learning_rate": 1.9630701757945264e-06, "logits/chosen": -1.7093853950500488, "logits/rejected": -1.6081286668777466, "logps/chosen": -16.79793357849121, "logps/rejected": -1.9593321084976196, "loss": 1.9322, "rewards/accuracies": 1.0, "rewards/chosen": 1.8137552738189697, "rewards/margins": 1.0969781875610352, "rewards/rejected": 0.7167770266532898, "step": 6475 }, { "epoch": 1.43, "learning_rate": 1.961646533189864e-06, "logits/chosen": -1.719435453414917, "logits/rejected": -1.6461704969406128, "logps/chosen": -94.71099090576172, "logps/rejected": -105.9484634399414, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": 6.907681941986084, "rewards/margins": 3.789097547531128, "rewards/rejected": 3.118584394454956, "step": 6476 }, { "epoch": 1.43, "learning_rate": 1.9602232810063175e-06, "logits/chosen": -1.8024427890777588, "logits/rejected": -1.8146544694900513, "logps/chosen": -16.128135681152344, "logps/rejected": -75.173828125, "loss": 0.9496, "rewards/accuracies": 0.0, "rewards/chosen": 3.539518117904663, "rewards/margins": -1.7328054904937744, "rewards/rejected": 5.2723236083984375, "step": 6477 }, { "epoch": 1.43, "learning_rate": 1.958800419426772e-06, "logits/chosen": -2.218336343765259, "logits/rejected": -2.1819722652435303, "logps/chosen": -72.97798156738281, "logps/rejected": -128.85879516601562, "loss": 0.756, "rewards/accuracies": 0.0, "rewards/chosen": 7.816285610198975, "rewards/margins": -1.1720080375671387, "rewards/rejected": 8.988293647766113, "step": 6478 }, { "epoch": 1.43, "learning_rate": 1.9573779486340623e-06, "logits/chosen": -2.1177773475646973, "logits/rejected": -2.0779647827148438, "logps/chosen": -69.73658752441406, "logps/rejected": -62.5548095703125, "loss": 0.1282, "rewards/accuracies": 1.0, "rewards/chosen": 7.141638278961182, "rewards/margins": 3.9822678565979004, "rewards/rejected": 3.1593704223632812, "step": 6479 }, { "epoch": 1.43, "learning_rate": 1.9559558688109735e-06, "logits/chosen": -1.8342182636260986, "logits/rejected": -1.6074119806289673, "logps/chosen": -98.01199340820312, "logps/rejected": -23.052337646484375, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 6.832713603973389, "rewards/margins": 5.651002407073975, "rewards/rejected": 1.181711196899414, "step": 6480 }, { "epoch": 1.43, "learning_rate": 1.9545341801402343e-06, "logits/chosen": -1.9160809516906738, "logits/rejected": -1.869483232498169, "logps/chosen": -79.69611358642578, "logps/rejected": -94.80891418457031, "loss": 0.1651, "rewards/accuracies": 1.0, "rewards/chosen": 9.093567848205566, "rewards/margins": 1.2801446914672852, "rewards/rejected": 7.813423156738281, "step": 6481 }, { "epoch": 1.43, "learning_rate": 1.9531128828045355e-06, "logits/chosen": -2.181481122970581, "logits/rejected": -2.2099545001983643, "logps/chosen": -27.66588592529297, "logps/rejected": -61.35647201538086, "loss": 2.0432, "rewards/accuracies": 0.0, "rewards/chosen": 5.219371318817139, "rewards/margins": -3.7586865425109863, "rewards/rejected": 8.978057861328125, "step": 6482 }, { "epoch": 1.43, "learning_rate": 1.951691976986503e-06, "logits/chosen": -1.7332322597503662, "logits/rejected": -1.7332322597503662, "logps/chosen": -12.092432022094727, "logps/rejected": -12.092432022094727, "loss": 0.727, "rewards/accuracies": 0.0, "rewards/chosen": 1.8079811334609985, "rewards/margins": 0.0, "rewards/rejected": 1.8079811334609985, "step": 6483 }, { "epoch": 1.44, "learning_rate": 1.9502714628687274e-06, "logits/chosen": -1.9217039346694946, "logits/rejected": -1.9198815822601318, "logps/chosen": -53.610923767089844, "logps/rejected": -31.23845863342285, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": 3.4322402477264404, "rewards/margins": 0.22459697723388672, "rewards/rejected": 3.2076432704925537, "step": 6484 }, { "epoch": 1.44, "learning_rate": 1.948851340633735e-06, "logits/chosen": -1.8965097665786743, "logits/rejected": -1.8717715740203857, "logps/chosen": -43.569190979003906, "logps/rejected": -50.06190490722656, "loss": 0.2441, "rewards/accuracies": 1.0, "rewards/chosen": 3.223789930343628, "rewards/margins": 1.2276793718338013, "rewards/rejected": 1.9961105585098267, "step": 6485 }, { "epoch": 1.44, "learning_rate": 1.9474316104640113e-06, "logits/chosen": -2.093392848968506, "logits/rejected": -2.0946784019470215, "logps/chosen": -61.878700256347656, "logps/rejected": -46.08216094970703, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 4.701833248138428, "rewards/margins": 0.6032943725585938, "rewards/rejected": 4.098538875579834, "step": 6486 }, { "epoch": 1.44, "learning_rate": 1.9460122725419873e-06, "logits/chosen": -1.9578717947006226, "logits/rejected": -1.9472732543945312, "logps/chosen": -46.372581481933594, "logps/rejected": -49.56269073486328, "loss": 0.1077, "rewards/accuracies": 1.0, "rewards/chosen": 5.410404205322266, "rewards/margins": 1.4489777088165283, "rewards/rejected": 3.9614264965057373, "step": 6487 }, { "epoch": 1.44, "learning_rate": 1.9445933270500444e-06, "logits/chosen": -1.979216456413269, "logits/rejected": -1.9840489625930786, "logps/chosen": -38.419456481933594, "logps/rejected": -85.11625671386719, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 4.237498760223389, "rewards/margins": 0.6249117851257324, "rewards/rejected": 3.6125869750976562, "step": 6488 }, { "epoch": 1.44, "learning_rate": 1.943174774170516e-06, "logits/chosen": -2.162194013595581, "logits/rejected": -2.1662697792053223, "logps/chosen": -41.88047790527344, "logps/rejected": -71.03789520263672, "loss": 0.157, "rewards/accuracies": 1.0, "rewards/chosen": 4.1214280128479, "rewards/margins": 1.0047187805175781, "rewards/rejected": 3.1167092323303223, "step": 6489 }, { "epoch": 1.44, "learning_rate": 1.9417566140856775e-06, "logits/chosen": -2.003941297531128, "logits/rejected": -2.003941297531128, "logps/chosen": -43.96839141845703, "logps/rejected": -43.96839141845703, "loss": 0.3999, "rewards/accuracies": 0.0, "rewards/chosen": 3.2359352111816406, "rewards/margins": 0.0, "rewards/rejected": 3.2359352111816406, "step": 6490 }, { "epoch": 1.44, "learning_rate": 1.9403388469777652e-06, "logits/chosen": -2.029019355773926, "logits/rejected": -2.0434892177581787, "logps/chosen": -30.136499404907227, "logps/rejected": -107.5498046875, "loss": 0.6545, "rewards/accuracies": 0.0, "rewards/chosen": 4.530776500701904, "rewards/margins": -0.9269413948059082, "rewards/rejected": 5.4577178955078125, "step": 6491 }, { "epoch": 1.44, "learning_rate": 1.938921473028954e-06, "logits/chosen": -1.8481765985488892, "logits/rejected": -1.8481765985488892, "logps/chosen": -15.789617538452148, "logps/rejected": -15.789617538452148, "loss": 0.5303, "rewards/accuracies": 0.0, "rewards/chosen": 5.004248142242432, "rewards/margins": 0.0, "rewards/rejected": 5.004248142242432, "step": 6492 }, { "epoch": 1.44, "learning_rate": 1.9375044924213755e-06, "logits/chosen": -2.05301570892334, "logits/rejected": -2.0392987728118896, "logps/chosen": -34.217376708984375, "logps/rejected": -46.99166488647461, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 3.971147298812866, "rewards/margins": 0.4694211483001709, "rewards/rejected": 3.5017261505126953, "step": 6493 }, { "epoch": 1.44, "learning_rate": 1.9360879053371073e-06, "logits/chosen": -1.9197988510131836, "logits/rejected": -2.008998394012451, "logps/chosen": -39.971229553222656, "logps/rejected": -102.5981674194336, "loss": 0.3608, "rewards/accuracies": 1.0, "rewards/chosen": 7.807933807373047, "rewards/margins": 0.03990793228149414, "rewards/rejected": 7.768025875091553, "step": 6494 }, { "epoch": 1.44, "learning_rate": 1.9346717119581783e-06, "logits/chosen": -2.000105857849121, "logits/rejected": -1.972359538078308, "logps/chosen": -62.89327621459961, "logps/rejected": -47.83171844482422, "loss": 0.0694, "rewards/accuracies": 1.0, "rewards/chosen": 4.0964579582214355, "rewards/margins": 1.904973030090332, "rewards/rejected": 2.1914849281311035, "step": 6495 }, { "epoch": 1.44, "learning_rate": 1.933255912466565e-06, "logits/chosen": -1.613436222076416, "logits/rejected": -1.6083344221115112, "logps/chosen": -8.808656692504883, "logps/rejected": -5.599211692810059, "loss": 0.4926, "rewards/accuracies": 0.0, "rewards/chosen": 0.6491770148277283, "rewards/margins": -0.25046467781066895, "rewards/rejected": 0.8996416926383972, "step": 6496 }, { "epoch": 1.44, "learning_rate": 1.9318405070441947e-06, "logits/chosen": -2.0688536167144775, "logits/rejected": -1.6617094278335571, "logps/chosen": -48.795963287353516, "logps/rejected": -94.35887908935547, "loss": 0.1559, "rewards/accuracies": 1.0, "rewards/chosen": 4.244074821472168, "rewards/margins": 1.7769873142242432, "rewards/rejected": 2.467087507247925, "step": 6497 }, { "epoch": 1.44, "learning_rate": 1.9304254958729433e-06, "logits/chosen": -1.8118642568588257, "logits/rejected": -1.7561614513397217, "logps/chosen": -76.95259094238281, "logps/rejected": -42.43736267089844, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 7.119162082672119, "rewards/margins": 3.0531797409057617, "rewards/rejected": 4.065982341766357, "step": 6498 }, { "epoch": 1.44, "learning_rate": 1.9290108791346374e-06, "logits/chosen": -2.2391769886016846, "logits/rejected": -2.1625564098358154, "logps/chosen": -60.40379333496094, "logps/rejected": -147.29905700683594, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 8.010452270507812, "rewards/margins": 4.168940544128418, "rewards/rejected": 3.8415114879608154, "step": 6499 }, { "epoch": 1.44, "learning_rate": 1.927596657011052e-06, "logits/chosen": -1.8206497430801392, "logits/rejected": -1.8779637813568115, "logps/chosen": -41.020633697509766, "logps/rejected": -26.18437385559082, "loss": 0.2012, "rewards/accuracies": 1.0, "rewards/chosen": 3.751127243041992, "rewards/margins": 1.1192057132720947, "rewards/rejected": 2.6319215297698975, "step": 6500 }, { "epoch": 1.44, "learning_rate": 1.926182829683909e-06, "logits/chosen": -2.024728775024414, "logits/rejected": -2.1190030574798584, "logps/chosen": -53.4743766784668, "logps/rejected": -112.16841125488281, "loss": 2.6024, "rewards/accuracies": 0.0, "rewards/chosen": 4.028067588806152, "rewards/margins": -5.132805824279785, "rewards/rejected": 9.160873413085938, "step": 6501 }, { "epoch": 1.44, "learning_rate": 1.9247693973348834e-06, "logits/chosen": -1.5768232345581055, "logits/rejected": -1.5768232345581055, "logps/chosen": -43.420013427734375, "logps/rejected": -43.420013427734375, "loss": 0.3752, "rewards/accuracies": 0.0, "rewards/chosen": 5.528972148895264, "rewards/margins": 0.0, "rewards/rejected": 5.528972148895264, "step": 6502 }, { "epoch": 1.44, "learning_rate": 1.9233563601455975e-06, "logits/chosen": -2.1041340827941895, "logits/rejected": -2.1358137130737305, "logps/chosen": -45.61188507080078, "logps/rejected": -199.304443359375, "loss": 1.2675, "rewards/accuracies": 0.0, "rewards/chosen": 5.490671634674072, "rewards/margins": -2.4490318298339844, "rewards/rejected": 7.939703464508057, "step": 6503 }, { "epoch": 1.44, "learning_rate": 1.921943718297623e-06, "logits/chosen": -2.1128861904144287, "logits/rejected": -2.0254852771759033, "logps/chosen": -10.57708740234375, "logps/rejected": -45.786800384521484, "loss": 0.433, "rewards/accuracies": 1.0, "rewards/chosen": 1.2747058868408203, "rewards/margins": 0.005315423011779785, "rewards/rejected": 1.2693904638290405, "step": 6504 }, { "epoch": 1.44, "learning_rate": 1.9205314719724815e-06, "logits/chosen": -2.063610553741455, "logits/rejected": -2.088603973388672, "logps/chosen": -78.14749145507812, "logps/rejected": -140.82174682617188, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": 3.5329949855804443, "rewards/margins": 1.636441946029663, "rewards/rejected": 1.8965530395507812, "step": 6505 }, { "epoch": 1.44, "learning_rate": 1.9191196213516427e-06, "logits/chosen": -2.090740442276001, "logits/rejected": -1.6186466217041016, "logps/chosen": -89.71733093261719, "logps/rejected": -124.03611755371094, "loss": 0.2953, "rewards/accuracies": 1.0, "rewards/chosen": 6.35800313949585, "rewards/margins": 0.4762296676635742, "rewards/rejected": 5.881773471832275, "step": 6506 }, { "epoch": 1.44, "learning_rate": 1.9177081666165265e-06, "logits/chosen": -1.5934873819351196, "logits/rejected": -1.5581719875335693, "logps/chosen": -28.634553909301758, "logps/rejected": -62.379783630371094, "loss": 0.1817, "rewards/accuracies": 1.0, "rewards/chosen": 2.620072841644287, "rewards/margins": 1.1531996726989746, "rewards/rejected": 1.4668731689453125, "step": 6507 }, { "epoch": 1.44, "learning_rate": 1.9162971079485015e-06, "logits/chosen": -2.032564163208008, "logits/rejected": -1.9506126642227173, "logps/chosen": -81.25025939941406, "logps/rejected": -97.00051879882812, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 6.288560390472412, "rewards/margins": 4.0581374168396, "rewards/rejected": 2.2304229736328125, "step": 6508 }, { "epoch": 1.44, "learning_rate": 1.9148864455288865e-06, "logits/chosen": -2.1545121669769287, "logits/rejected": -2.1621193885803223, "logps/chosen": -73.81629180908203, "logps/rejected": -110.00100708007812, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 9.063739776611328, "rewards/margins": 2.57895565032959, "rewards/rejected": 6.484784126281738, "step": 6509 }, { "epoch": 1.44, "learning_rate": 1.913476179538945e-06, "logits/chosen": -2.215019941329956, "logits/rejected": -2.2258121967315674, "logps/chosen": -99.2077865600586, "logps/rejected": -48.353363037109375, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 6.764353275299072, "rewards/margins": 3.0677382946014404, "rewards/rejected": 3.696614980697632, "step": 6510 }, { "epoch": 1.44, "learning_rate": 1.912066310159895e-06, "logits/chosen": -1.93053138256073, "logits/rejected": -1.9059501886367798, "logps/chosen": -18.695629119873047, "logps/rejected": -44.99861526489258, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 1.6165722608566284, "rewards/margins": 0.38111352920532227, "rewards/rejected": 1.2354587316513062, "step": 6511 }, { "epoch": 1.44, "learning_rate": 1.9106568375729007e-06, "logits/chosen": -1.870656132698059, "logits/rejected": -1.849968433380127, "logps/chosen": -90.68382263183594, "logps/rejected": -37.2696418762207, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 8.640700340270996, "rewards/margins": 6.1035614013671875, "rewards/rejected": 2.5371387004852295, "step": 6512 }, { "epoch": 1.44, "learning_rate": 1.9092477619590766e-06, "logits/chosen": -2.1755666732788086, "logits/rejected": -2.1757185459136963, "logps/chosen": -41.513511657714844, "logps/rejected": -60.88597106933594, "loss": 0.1356, "rewards/accuracies": 1.0, "rewards/chosen": 4.049445629119873, "rewards/margins": 1.1707301139831543, "rewards/rejected": 2.8787155151367188, "step": 6513 }, { "epoch": 1.44, "learning_rate": 1.907839083499485e-06, "logits/chosen": -1.6590687036514282, "logits/rejected": -1.7267777919769287, "logps/chosen": -46.848915100097656, "logps/rejected": -75.80831146240234, "loss": 1.271, "rewards/accuracies": 0.0, "rewards/chosen": 4.866363048553467, "rewards/margins": -2.3310928344726562, "rewards/rejected": 7.197455883026123, "step": 6514 }, { "epoch": 1.44, "learning_rate": 1.9064308023751378e-06, "logits/chosen": -2.092435836791992, "logits/rejected": -2.086409091949463, "logps/chosen": -57.8785514831543, "logps/rejected": -61.3948974609375, "loss": 0.4189, "rewards/accuracies": 1.0, "rewards/chosen": 4.369578838348389, "rewards/margins": 0.6681902408599854, "rewards/rejected": 3.7013885974884033, "step": 6515 }, { "epoch": 1.44, "learning_rate": 1.905022918766995e-06, "logits/chosen": -1.9005355834960938, "logits/rejected": -1.8213366270065308, "logps/chosen": -46.31729507446289, "logps/rejected": -46.507728576660156, "loss": 0.1477, "rewards/accuracies": 1.0, "rewards/chosen": 4.346113204956055, "rewards/margins": 1.5566668510437012, "rewards/rejected": 2.7894463539123535, "step": 6516 }, { "epoch": 1.44, "learning_rate": 1.903615432855968e-06, "logits/chosen": -2.326882839202881, "logits/rejected": -2.3742692470550537, "logps/chosen": -83.87940216064453, "logps/rejected": -71.2118911743164, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 8.238595008850098, "rewards/margins": 6.033851623535156, "rewards/rejected": 2.2047431468963623, "step": 6517 }, { "epoch": 1.44, "learning_rate": 1.9022083448229139e-06, "logits/chosen": -1.9895923137664795, "logits/rejected": -1.9436882734298706, "logps/chosen": -55.91758728027344, "logps/rejected": -53.44268035888672, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 3.3869035243988037, "rewards/margins": 1.8266571760177612, "rewards/rejected": 1.5602463483810425, "step": 6518 }, { "epoch": 1.44, "learning_rate": 1.9008016548486425e-06, "logits/chosen": -1.6595245599746704, "logits/rejected": -1.6679902076721191, "logps/chosen": -49.24424743652344, "logps/rejected": -65.90518188476562, "loss": 0.2533, "rewards/accuracies": 1.0, "rewards/chosen": 3.816607713699341, "rewards/margins": 0.5201950073242188, "rewards/rejected": 3.296412706375122, "step": 6519 }, { "epoch": 1.44, "learning_rate": 1.8993953631139062e-06, "logits/chosen": -2.0336413383483887, "logits/rejected": -2.02944278717041, "logps/chosen": -32.91386413574219, "logps/rejected": -98.1977310180664, "loss": 0.4468, "rewards/accuracies": 1.0, "rewards/chosen": 3.4108054637908936, "rewards/margins": 0.0246124267578125, "rewards/rejected": 3.386193037033081, "step": 6520 }, { "epoch": 1.44, "learning_rate": 1.8979894697994122e-06, "logits/chosen": -1.884332299232483, "logits/rejected": -1.8653250932693481, "logps/chosen": -30.856128692626953, "logps/rejected": -71.95655822753906, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": 3.4117634296417236, "rewards/margins": 3.0615367889404297, "rewards/rejected": 0.35022661089897156, "step": 6521 }, { "epoch": 1.44, "learning_rate": 1.8965839750858146e-06, "logits/chosen": -1.6840115785598755, "logits/rejected": -1.6602470874786377, "logps/chosen": -29.644134521484375, "logps/rejected": -50.43281173706055, "loss": 0.1061, "rewards/accuracies": 1.0, "rewards/chosen": 4.213012218475342, "rewards/margins": 1.6657330989837646, "rewards/rejected": 2.547279119491577, "step": 6522 }, { "epoch": 1.44, "learning_rate": 1.8951788791537157e-06, "logits/chosen": -1.7605352401733398, "logits/rejected": -1.69484543800354, "logps/chosen": -155.93624877929688, "logps/rejected": -140.28146362304688, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": 8.796112060546875, "rewards/margins": 2.8492188453674316, "rewards/rejected": 5.946893215179443, "step": 6523 }, { "epoch": 1.44, "learning_rate": 1.893774182183667e-06, "logits/chosen": -1.9165441989898682, "logits/rejected": -1.9049264192581177, "logps/chosen": -86.62718963623047, "logps/rejected": -79.5624008178711, "loss": 0.1227, "rewards/accuracies": 1.0, "rewards/chosen": 8.017820358276367, "rewards/margins": 1.339395523071289, "rewards/rejected": 6.678424835205078, "step": 6524 }, { "epoch": 1.44, "learning_rate": 1.8923698843561684e-06, "logits/chosen": -2.126513719558716, "logits/rejected": -2.110046148300171, "logps/chosen": -38.654327392578125, "logps/rejected": -72.62776947021484, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 5.733145236968994, "rewards/margins": 1.466796875, "rewards/rejected": 4.266348361968994, "step": 6525 }, { "epoch": 1.44, "learning_rate": 1.8909659858516693e-06, "logits/chosen": -1.731718897819519, "logits/rejected": -1.759947419166565, "logps/chosen": -22.779163360595703, "logps/rejected": -42.33729934692383, "loss": 0.4515, "rewards/accuracies": 0.0, "rewards/chosen": 2.692919969558716, "rewards/margins": -0.3620333671569824, "rewards/rejected": 3.0549533367156982, "step": 6526 }, { "epoch": 1.44, "learning_rate": 1.8895624868505674e-06, "logits/chosen": -1.91570246219635, "logits/rejected": -1.8582324981689453, "logps/chosen": -37.82614517211914, "logps/rejected": -65.40959930419922, "loss": 0.1276, "rewards/accuracies": 1.0, "rewards/chosen": 4.509122848510742, "rewards/margins": 1.2656290531158447, "rewards/rejected": 3.2434937953948975, "step": 6527 }, { "epoch": 1.44, "learning_rate": 1.88815938753321e-06, "logits/chosen": -1.9996976852416992, "logits/rejected": -1.9996976852416992, "logps/chosen": -66.87539672851562, "logps/rejected": -66.87539672851562, "loss": 0.3668, "rewards/accuracies": 0.0, "rewards/chosen": 6.022270202636719, "rewards/margins": 0.0, "rewards/rejected": 6.022270202636719, "step": 6528 }, { "epoch": 1.45, "learning_rate": 1.8867566880798876e-06, "logits/chosen": -1.7373219728469849, "logits/rejected": -1.7189257144927979, "logps/chosen": -28.526142120361328, "logps/rejected": -32.625633239746094, "loss": 0.8563, "rewards/accuracies": 0.0, "rewards/chosen": 3.1937363147735596, "rewards/margins": -1.3938171863555908, "rewards/rejected": 4.58755350112915, "step": 6529 }, { "epoch": 1.45, "learning_rate": 1.8853543886708498e-06, "logits/chosen": -2.0252668857574463, "logits/rejected": -1.9350686073303223, "logps/chosen": -67.92988586425781, "logps/rejected": -33.12850570678711, "loss": 0.3229, "rewards/accuracies": 1.0, "rewards/chosen": 4.1259942054748535, "rewards/margins": 0.8868350982666016, "rewards/rejected": 3.239159107208252, "step": 6530 }, { "epoch": 1.45, "learning_rate": 1.8839524894862844e-06, "logits/chosen": -2.1774327754974365, "logits/rejected": -1.9799188375473022, "logps/chosen": -123.40910339355469, "logps/rejected": -169.94113159179688, "loss": 0.632, "rewards/accuracies": 0.0, "rewards/chosen": 12.268050193786621, "rewards/margins": -0.8493175506591797, "rewards/rejected": 13.1173677444458, "step": 6531 }, { "epoch": 1.45, "learning_rate": 1.8825509907063328e-06, "logits/chosen": -1.7288116216659546, "logits/rejected": -1.7547245025634766, "logps/chosen": -69.97561645507812, "logps/rejected": -82.15372467041016, "loss": 0.5785, "rewards/accuracies": 0.0, "rewards/chosen": 6.768902778625488, "rewards/margins": -0.7743277549743652, "rewards/rejected": 7.5432305335998535, "step": 6532 }, { "epoch": 1.45, "learning_rate": 1.8811498925110849e-06, "logits/chosen": -1.891183853149414, "logits/rejected": -1.891183853149414, "logps/chosen": -23.61286735534668, "logps/rejected": -23.61286735534668, "loss": 0.4379, "rewards/accuracies": 0.0, "rewards/chosen": 0.09010963886976242, "rewards/margins": 0.0, "rewards/rejected": 0.09010963886976242, "step": 6533 }, { "epoch": 1.45, "learning_rate": 1.8797491950805785e-06, "logits/chosen": -1.8434091806411743, "logits/rejected": -1.7666170597076416, "logps/chosen": -48.97533416748047, "logps/rejected": -50.272117614746094, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 3.9916832447052, "rewards/margins": 2.923861026763916, "rewards/rejected": 1.0678223371505737, "step": 6534 }, { "epoch": 1.45, "learning_rate": 1.8783488985947995e-06, "logits/chosen": -1.9507339000701904, "logits/rejected": -1.8593616485595703, "logps/chosen": -65.17495727539062, "logps/rejected": -20.989946365356445, "loss": 0.1499, "rewards/accuracies": 1.0, "rewards/chosen": 2.4666335582733154, "rewards/margins": 1.2346409559249878, "rewards/rejected": 1.2319926023483276, "step": 6535 }, { "epoch": 1.45, "learning_rate": 1.8769490032336828e-06, "logits/chosen": -2.0976223945617676, "logits/rejected": -2.012423038482666, "logps/chosen": -113.72804260253906, "logps/rejected": -27.359375, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 7.462892055511475, "rewards/margins": 4.194058418273926, "rewards/rejected": 3.268833875656128, "step": 6536 }, { "epoch": 1.45, "learning_rate": 1.8755495091771137e-06, "logits/chosen": -2.2510173320770264, "logits/rejected": -2.196845769882202, "logps/chosen": -97.82901000976562, "logps/rejected": -59.774410247802734, "loss": 0.2366, "rewards/accuracies": 1.0, "rewards/chosen": 7.369314670562744, "rewards/margins": 2.325190544128418, "rewards/rejected": 5.044124126434326, "step": 6537 }, { "epoch": 1.45, "learning_rate": 1.8741504166049174e-06, "logits/chosen": -1.9795310497283936, "logits/rejected": -1.9095664024353027, "logps/chosen": -62.961891174316406, "logps/rejected": -13.9158935546875, "loss": 0.0796, "rewards/accuracies": 1.0, "rewards/chosen": 2.943126678466797, "rewards/margins": 2.5618338584899902, "rewards/rejected": 0.3812929093837738, "step": 6538 }, { "epoch": 1.45, "learning_rate": 1.8727517256968824e-06, "logits/chosen": -1.9246931076049805, "logits/rejected": -1.9246931076049805, "logps/chosen": -37.10200500488281, "logps/rejected": -37.10200500488281, "loss": 0.5287, "rewards/accuracies": 0.0, "rewards/chosen": 6.1389923095703125, "rewards/margins": 0.0, "rewards/rejected": 6.1389923095703125, "step": 6539 }, { "epoch": 1.45, "learning_rate": 1.8713534366327313e-06, "logits/chosen": -1.7288181781768799, "logits/rejected": -1.6680768728256226, "logps/chosen": -37.0185661315918, "logps/rejected": -53.435882568359375, "loss": 0.957, "rewards/accuracies": 0.0, "rewards/chosen": 2.6879894733428955, "rewards/margins": -1.7320277690887451, "rewards/rejected": 4.420017242431641, "step": 6540 }, { "epoch": 1.45, "learning_rate": 1.8699555495921418e-06, "logits/chosen": -1.884901523590088, "logits/rejected": -1.8249069452285767, "logps/chosen": -46.11251449584961, "logps/rejected": -42.33311462402344, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 5.529608726501465, "rewards/margins": 3.954866409301758, "rewards/rejected": 1.5747421979904175, "step": 6541 }, { "epoch": 1.45, "learning_rate": 1.8685580647547401e-06, "logits/chosen": -2.1651346683502197, "logits/rejected": -2.181396722793579, "logps/chosen": -55.777557373046875, "logps/rejected": -84.41633605957031, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 5.207583904266357, "rewards/margins": 1.5201494693756104, "rewards/rejected": 3.687434434890747, "step": 6542 }, { "epoch": 1.45, "learning_rate": 1.8671609823000998e-06, "logits/chosen": -1.8066771030426025, "logits/rejected": -1.7768057584762573, "logps/chosen": -48.712955474853516, "logps/rejected": -45.72920227050781, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 4.005277633666992, "rewards/margins": 2.2530345916748047, "rewards/rejected": 1.7522430419921875, "step": 6543 }, { "epoch": 1.45, "learning_rate": 1.8657643024077431e-06, "logits/chosen": -1.7472221851348877, "logits/rejected": -1.6612154245376587, "logps/chosen": -37.34309387207031, "logps/rejected": -71.88604736328125, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": 2.6307809352874756, "rewards/margins": 1.8134537935256958, "rewards/rejected": 0.8173271417617798, "step": 6544 }, { "epoch": 1.45, "learning_rate": 1.8643680252571362e-06, "logits/chosen": -1.9458112716674805, "logits/rejected": -1.8776527643203735, "logps/chosen": -98.59302520751953, "logps/rejected": -38.79143524169922, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 5.7989935874938965, "rewards/margins": 2.367448329925537, "rewards/rejected": 3.4315452575683594, "step": 6545 }, { "epoch": 1.45, "learning_rate": 1.8629721510277031e-06, "logits/chosen": -1.6859707832336426, "logits/rejected": -1.591588020324707, "logps/chosen": -63.86241149902344, "logps/rejected": -37.27948760986328, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 3.45343017578125, "rewards/margins": 2.1797475814819336, "rewards/rejected": 1.2736824750900269, "step": 6546 }, { "epoch": 1.45, "learning_rate": 1.8615766798988043e-06, "logits/chosen": -2.1052463054656982, "logits/rejected": -2.081591844558716, "logps/chosen": -26.310184478759766, "logps/rejected": -64.06510925292969, "loss": 0.3245, "rewards/accuracies": 1.0, "rewards/chosen": 3.9203593730926514, "rewards/margins": 0.11471748352050781, "rewards/rejected": 3.8056418895721436, "step": 6547 }, { "epoch": 1.45, "learning_rate": 1.8601816120497613e-06, "logits/chosen": -1.998738408088684, "logits/rejected": -1.9685901403427124, "logps/chosen": -41.637176513671875, "logps/rejected": -45.817848205566406, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 4.820615291595459, "rewards/margins": 1.8512351512908936, "rewards/rejected": 2.9693801403045654, "step": 6548 }, { "epoch": 1.45, "learning_rate": 1.8587869476598292e-06, "logits/chosen": -1.8772156238555908, "logits/rejected": -1.865645170211792, "logps/chosen": -39.402015686035156, "logps/rejected": -95.38795471191406, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 4.013875484466553, "rewards/margins": 1.4099509716033936, "rewards/rejected": 2.603924512863159, "step": 6549 }, { "epoch": 1.45, "learning_rate": 1.857392686908227e-06, "logits/chosen": -1.8964884281158447, "logits/rejected": -1.9080442190170288, "logps/chosen": -62.76258850097656, "logps/rejected": -110.34876251220703, "loss": 1.1134, "rewards/accuracies": 0.0, "rewards/chosen": 6.933294773101807, "rewards/margins": -2.031160831451416, "rewards/rejected": 8.964455604553223, "step": 6550 }, { "epoch": 1.45, "learning_rate": 1.8559988299741078e-06, "logits/chosen": -2.056919574737549, "logits/rejected": -2.0664639472961426, "logps/chosen": -47.91051483154297, "logps/rejected": -66.837890625, "loss": 1.8588, "rewards/accuracies": 0.0, "rewards/chosen": 3.6137771606445312, "rewards/margins": -3.5831985473632812, "rewards/rejected": 7.1969757080078125, "step": 6551 }, { "epoch": 1.45, "learning_rate": 1.8546053770365814e-06, "logits/chosen": -2.108098268508911, "logits/rejected": -2.0509579181671143, "logps/chosen": -134.67222595214844, "logps/rejected": -20.928983688354492, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 9.866854667663574, "rewards/margins": 9.140473365783691, "rewards/rejected": 0.7263811230659485, "step": 6552 }, { "epoch": 1.45, "learning_rate": 1.853212328274704e-06, "logits/chosen": -1.953458309173584, "logits/rejected": -1.953458309173584, "logps/chosen": -35.218055725097656, "logps/rejected": -35.218055725097656, "loss": 0.3968, "rewards/accuracies": 0.0, "rewards/chosen": 3.3883721828460693, "rewards/margins": 0.0, "rewards/rejected": 3.3883721828460693, "step": 6553 }, { "epoch": 1.45, "learning_rate": 1.8518196838674745e-06, "logits/chosen": -1.878402590751648, "logits/rejected": -1.8780581951141357, "logps/chosen": -57.068687438964844, "logps/rejected": -62.998653411865234, "loss": 1.1221, "rewards/accuracies": 0.0, "rewards/chosen": 6.2258124351501465, "rewards/margins": -2.047410488128662, "rewards/rejected": 8.273222923278809, "step": 6554 }, { "epoch": 1.45, "learning_rate": 1.8504274439938507e-06, "logits/chosen": -1.8117289543151855, "logits/rejected": -1.8208829164505005, "logps/chosen": -44.69078063964844, "logps/rejected": -46.44109344482422, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 4.075894355773926, "rewards/margins": 0.6484253406524658, "rewards/rejected": 3.42746901512146, "step": 6555 }, { "epoch": 1.45, "learning_rate": 1.8490356088327261e-06, "logits/chosen": -2.083425998687744, "logits/rejected": -2.0533714294433594, "logps/chosen": -73.40794372558594, "logps/rejected": -56.835243225097656, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 6.0368852615356445, "rewards/margins": 2.735546350479126, "rewards/rejected": 3.3013389110565186, "step": 6556 }, { "epoch": 1.45, "learning_rate": 1.847644178562954e-06, "logits/chosen": -2.0229384899139404, "logits/rejected": -1.936559796333313, "logps/chosen": -35.049560546875, "logps/rejected": -17.372774124145508, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 3.3597450256347656, "rewards/margins": 1.7792836427688599, "rewards/rejected": 1.5804613828659058, "step": 6557 }, { "epoch": 1.45, "learning_rate": 1.8462531533633238e-06, "logits/chosen": -1.934646487236023, "logits/rejected": -1.841952919960022, "logps/chosen": -154.4564208984375, "logps/rejected": -50.17313003540039, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 7.469592571258545, "rewards/margins": 5.61026668548584, "rewards/rejected": 1.8593257665634155, "step": 6558 }, { "epoch": 1.45, "learning_rate": 1.8448625334125853e-06, "logits/chosen": -1.8069106340408325, "logits/rejected": -1.771011233329773, "logps/chosen": -48.660396575927734, "logps/rejected": -49.696685791015625, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": 5.3297810554504395, "rewards/margins": 1.80924654006958, "rewards/rejected": 3.5205345153808594, "step": 6559 }, { "epoch": 1.45, "learning_rate": 1.843472318889425e-06, "logits/chosen": -1.8679732084274292, "logits/rejected": -1.7415598630905151, "logps/chosen": -40.46906661987305, "logps/rejected": -21.418975830078125, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": 3.381030797958374, "rewards/margins": 1.8346021175384521, "rewards/rejected": 1.5464286804199219, "step": 6560 }, { "epoch": 1.45, "learning_rate": 1.8420825099724837e-06, "logits/chosen": -1.6515263319015503, "logits/rejected": -1.6842525005340576, "logps/chosen": -14.238119125366211, "logps/rejected": -36.898921966552734, "loss": 0.3158, "rewards/accuracies": 1.0, "rewards/chosen": 2.443856954574585, "rewards/margins": 0.25673341751098633, "rewards/rejected": 2.1871235370635986, "step": 6561 }, { "epoch": 1.45, "learning_rate": 1.84069310684035e-06, "logits/chosen": -1.7519787549972534, "logits/rejected": -1.7135111093521118, "logps/chosen": -63.759613037109375, "logps/rejected": -120.4419937133789, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 6.9745635986328125, "rewards/margins": 2.609555721282959, "rewards/rejected": 4.3650078773498535, "step": 6562 }, { "epoch": 1.45, "learning_rate": 1.8393041096715536e-06, "logits/chosen": -2.045401096343994, "logits/rejected": -2.0527894496917725, "logps/chosen": -35.63804626464844, "logps/rejected": -64.84672546386719, "loss": 0.2542, "rewards/accuracies": 1.0, "rewards/chosen": 5.207258701324463, "rewards/margins": 1.2255113124847412, "rewards/rejected": 3.9817473888397217, "step": 6563 }, { "epoch": 1.45, "learning_rate": 1.8379155186445846e-06, "logits/chosen": -2.1379127502441406, "logits/rejected": -2.1011509895324707, "logps/chosen": -100.54006958007812, "logps/rejected": -48.39470672607422, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": 8.161209106445312, "rewards/margins": 2.2653450965881348, "rewards/rejected": 5.895864009857178, "step": 6564 }, { "epoch": 1.45, "learning_rate": 1.836527333937867e-06, "logits/chosen": -1.8686405420303345, "logits/rejected": -1.7906239032745361, "logps/chosen": -82.05965423583984, "logps/rejected": -56.592628479003906, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 7.174204349517822, "rewards/margins": 3.8115761280059814, "rewards/rejected": 3.362628221511841, "step": 6565 }, { "epoch": 1.45, "learning_rate": 1.8351395557297852e-06, "logits/chosen": -1.836181879043579, "logits/rejected": -1.804625153541565, "logps/chosen": -31.6878604888916, "logps/rejected": -48.950748443603516, "loss": 0.2491, "rewards/accuracies": 1.0, "rewards/chosen": 4.070061683654785, "rewards/margins": 0.44462037086486816, "rewards/rejected": 3.625441312789917, "step": 6566 }, { "epoch": 1.45, "learning_rate": 1.8337521841986605e-06, "logits/chosen": -1.757632851600647, "logits/rejected": -1.8059370517730713, "logps/chosen": -62.0889778137207, "logps/rejected": -79.82551574707031, "loss": 0.2354, "rewards/accuracies": 1.0, "rewards/chosen": 9.504069328308105, "rewards/margins": 0.6462001800537109, "rewards/rejected": 8.857869148254395, "step": 6567 }, { "epoch": 1.45, "learning_rate": 1.8323652195227692e-06, "logits/chosen": -1.975067377090454, "logits/rejected": -1.9255362749099731, "logps/chosen": -110.0291519165039, "logps/rejected": -96.73124694824219, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 10.416622161865234, "rewards/margins": 2.4905996322631836, "rewards/rejected": 7.926022529602051, "step": 6568 }, { "epoch": 1.45, "learning_rate": 1.830978661880332e-06, "logits/chosen": -1.9287608861923218, "logits/rejected": -2.0293917655944824, "logps/chosen": -27.834999084472656, "logps/rejected": -151.76004028320312, "loss": 1.4657, "rewards/accuracies": 0.0, "rewards/chosen": 4.716555118560791, "rewards/margins": -2.815640926361084, "rewards/rejected": 7.532196044921875, "step": 6569 }, { "epoch": 1.45, "learning_rate": 1.829592511449519e-06, "logits/chosen": -2.0919954776763916, "logits/rejected": -2.052812099456787, "logps/chosen": -68.59446716308594, "logps/rejected": -61.76234436035156, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 5.17687463760376, "rewards/margins": 2.729487657546997, "rewards/rejected": 2.4473869800567627, "step": 6570 }, { "epoch": 1.45, "learning_rate": 1.828206768408448e-06, "logits/chosen": -1.9222304821014404, "logits/rejected": -1.6745980978012085, "logps/chosen": -141.34457397460938, "logps/rejected": -75.3311767578125, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": 6.947020053863525, "rewards/margins": 5.488931179046631, "rewards/rejected": 1.458088755607605, "step": 6571 }, { "epoch": 1.45, "learning_rate": 1.8268214329351797e-06, "logits/chosen": -1.8520135879516602, "logits/rejected": -1.8379011154174805, "logps/chosen": -42.228515625, "logps/rejected": -68.06442260742188, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": 4.435003757476807, "rewards/margins": 1.054328203201294, "rewards/rejected": 3.3806755542755127, "step": 6572 }, { "epoch": 1.45, "learning_rate": 1.8254365052077322e-06, "logits/chosen": -1.6456962823867798, "logits/rejected": -1.6387344598770142, "logps/chosen": -66.58068084716797, "logps/rejected": -89.71101379394531, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 6.3882269859313965, "rewards/margins": 1.672600269317627, "rewards/rejected": 4.7156267166137695, "step": 6573 }, { "epoch": 1.46, "learning_rate": 1.8240519854040589e-06, "logits/chosen": -1.9316567182540894, "logits/rejected": -1.9291393756866455, "logps/chosen": -39.811344146728516, "logps/rejected": -71.76482391357422, "loss": 0.4102, "rewards/accuracies": 0.0, "rewards/chosen": 3.0801875591278076, "rewards/margins": -0.04532146453857422, "rewards/rejected": 3.125509023666382, "step": 6574 }, { "epoch": 1.46, "learning_rate": 1.8226678737020742e-06, "logits/chosen": -2.0504488945007324, "logits/rejected": -2.0504488945007324, "logps/chosen": -48.453731536865234, "logps/rejected": -48.453731536865234, "loss": 0.3474, "rewards/accuracies": 0.0, "rewards/chosen": 4.869873523712158, "rewards/margins": 0.0, "rewards/rejected": 4.869873523712158, "step": 6575 }, { "epoch": 1.46, "learning_rate": 1.821284170279628e-06, "logits/chosen": -1.796103835105896, "logits/rejected": -1.7245023250579834, "logps/chosen": -82.43717956542969, "logps/rejected": -59.663185119628906, "loss": 0.1563, "rewards/accuracies": 1.0, "rewards/chosen": 4.560673713684082, "rewards/margins": 1.5087060928344727, "rewards/rejected": 3.0519676208496094, "step": 6576 }, { "epoch": 1.46, "learning_rate": 1.8199008753145241e-06, "logits/chosen": -2.2009670734405518, "logits/rejected": -2.185399055480957, "logps/chosen": -52.143104553222656, "logps/rejected": -60.189579010009766, "loss": 0.6064, "rewards/accuracies": 0.0, "rewards/chosen": 3.0118911266326904, "rewards/margins": -0.8448247909545898, "rewards/rejected": 3.8567159175872803, "step": 6577 }, { "epoch": 1.46, "learning_rate": 1.8185179889845135e-06, "logits/chosen": -1.7205122709274292, "logits/rejected": -1.7205122709274292, "logps/chosen": -41.661373138427734, "logps/rejected": -41.661373138427734, "loss": 1.2177, "rewards/accuracies": 0.0, "rewards/chosen": 4.053529739379883, "rewards/margins": 0.0, "rewards/rejected": 4.053529739379883, "step": 6578 }, { "epoch": 1.46, "learning_rate": 1.817135511467293e-06, "logits/chosen": -1.844901442527771, "logits/rejected": -1.8509600162506104, "logps/chosen": -29.584436416625977, "logps/rejected": -51.58848571777344, "loss": 0.3775, "rewards/accuracies": 1.0, "rewards/chosen": 3.1627042293548584, "rewards/margins": 0.8188412189483643, "rewards/rejected": 2.343863010406494, "step": 6579 }, { "epoch": 1.46, "learning_rate": 1.8157534429405083e-06, "logits/chosen": -2.023355007171631, "logits/rejected": -2.03946590423584, "logps/chosen": -85.015380859375, "logps/rejected": -121.82935333251953, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 8.497309684753418, "rewards/margins": 2.4335989952087402, "rewards/rejected": 6.063710689544678, "step": 6580 }, { "epoch": 1.46, "learning_rate": 1.8143717835817509e-06, "logits/chosen": -2.3024282455444336, "logits/rejected": -2.314729928970337, "logps/chosen": -23.406064987182617, "logps/rejected": -85.12797546386719, "loss": 0.3987, "rewards/accuracies": 0.0, "rewards/chosen": 3.2759499549865723, "rewards/margins": -0.17330694198608398, "rewards/rejected": 3.4492568969726562, "step": 6581 }, { "epoch": 1.46, "learning_rate": 1.8129905335685631e-06, "logits/chosen": -1.8757442235946655, "logits/rejected": -1.793834924697876, "logps/chosen": -29.51113510131836, "logps/rejected": -7.595640182495117, "loss": 0.4179, "rewards/accuracies": 1.0, "rewards/chosen": 2.6779229640960693, "rewards/margins": 1.8747694492340088, "rewards/rejected": 0.8031534552574158, "step": 6582 }, { "epoch": 1.46, "learning_rate": 1.811609693078427e-06, "logits/chosen": -2.0668256282806396, "logits/rejected": -1.9572359323501587, "logps/chosen": -90.24281311035156, "logps/rejected": -51.86579895019531, "loss": 0.0657, "rewards/accuracies": 1.0, "rewards/chosen": 6.001089572906494, "rewards/margins": 1.9693207740783691, "rewards/rejected": 4.031768798828125, "step": 6583 }, { "epoch": 1.46, "learning_rate": 1.8102292622887841e-06, "logits/chosen": -1.9902693033218384, "logits/rejected": -1.9843487739562988, "logps/chosen": -39.79852294921875, "logps/rejected": -52.559791564941406, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 3.826568603515625, "rewards/margins": 0.282681941986084, "rewards/rejected": 3.543886661529541, "step": 6584 }, { "epoch": 1.46, "learning_rate": 1.8088492413770116e-06, "logits/chosen": -1.7733662128448486, "logits/rejected": -1.7549282312393188, "logps/chosen": -33.38893508911133, "logps/rejected": -34.861446380615234, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 2.713371753692627, "rewards/margins": 1.4061768054962158, "rewards/rejected": 1.3071949481964111, "step": 6585 }, { "epoch": 1.46, "learning_rate": 1.8074696305204397e-06, "logits/chosen": -1.7614015340805054, "logits/rejected": -1.7808973789215088, "logps/chosen": -91.24269104003906, "logps/rejected": -116.43974304199219, "loss": 0.1647, "rewards/accuracies": 1.0, "rewards/chosen": 10.572084426879883, "rewards/margins": 0.9872007369995117, "rewards/rejected": 9.584883689880371, "step": 6586 }, { "epoch": 1.46, "learning_rate": 1.806090429896346e-06, "logits/chosen": -1.5548988580703735, "logits/rejected": -1.5591301918029785, "logps/chosen": -26.204978942871094, "logps/rejected": -25.422264099121094, "loss": 0.7175, "rewards/accuracies": 0.0, "rewards/chosen": 2.950129747390747, "rewards/margins": -0.6674232482910156, "rewards/rejected": 3.6175529956817627, "step": 6587 }, { "epoch": 1.46, "learning_rate": 1.804711639681954e-06, "logits/chosen": -2.2623074054718018, "logits/rejected": -2.2762372493743896, "logps/chosen": -100.74461364746094, "logps/rejected": -92.539794921875, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 8.307283401489258, "rewards/margins": 3.6024250984191895, "rewards/rejected": 4.704858303070068, "step": 6588 }, { "epoch": 1.46, "learning_rate": 1.8033332600544351e-06, "logits/chosen": -1.6627631187438965, "logits/rejected": -1.6632493734359741, "logps/chosen": -49.241512298583984, "logps/rejected": -81.37092590332031, "loss": 0.5621, "rewards/accuracies": 0.0, "rewards/chosen": 4.868613243103027, "rewards/margins": -0.7184700965881348, "rewards/rejected": 5.587083339691162, "step": 6589 }, { "epoch": 1.46, "learning_rate": 1.8019552911909077e-06, "logits/chosen": -1.9215354919433594, "logits/rejected": -1.885000467300415, "logps/chosen": -59.05033874511719, "logps/rejected": -95.97236633300781, "loss": 0.1225, "rewards/accuracies": 1.0, "rewards/chosen": 4.152761936187744, "rewards/margins": 1.409517765045166, "rewards/rejected": 2.743244171142578, "step": 6590 }, { "epoch": 1.46, "learning_rate": 1.8005777332684393e-06, "logits/chosen": -1.8911657333374023, "logits/rejected": -1.834205150604248, "logps/chosen": -46.991294860839844, "logps/rejected": -87.03922271728516, "loss": 0.1417, "rewards/accuracies": 1.0, "rewards/chosen": 3.473320722579956, "rewards/margins": 1.1816902160644531, "rewards/rejected": 2.291630506515503, "step": 6591 }, { "epoch": 1.46, "learning_rate": 1.7992005864640371e-06, "logits/chosen": -1.835436224937439, "logits/rejected": -1.835436224937439, "logps/chosen": -14.286474227905273, "logps/rejected": -14.286474227905273, "loss": 0.3734, "rewards/accuracies": 0.0, "rewards/chosen": 2.967778205871582, "rewards/margins": 0.0, "rewards/rejected": 2.967778205871582, "step": 6592 }, { "epoch": 1.46, "learning_rate": 1.7978238509546687e-06, "logits/chosen": -1.887231707572937, "logits/rejected": -1.8533265590667725, "logps/chosen": -113.28955078125, "logps/rejected": -102.7450942993164, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 10.367807388305664, "rewards/margins": 3.986557960510254, "rewards/rejected": 6.38124942779541, "step": 6593 }, { "epoch": 1.46, "learning_rate": 1.796447526917236e-06, "logits/chosen": -1.7610810995101929, "logits/rejected": -1.680496096611023, "logps/chosen": -62.907222747802734, "logps/rejected": -29.671485900878906, "loss": 0.3797, "rewards/accuracies": 1.0, "rewards/chosen": 3.793928861618042, "rewards/margins": 2.680206298828125, "rewards/rejected": 1.1137226819992065, "step": 6594 }, { "epoch": 1.46, "learning_rate": 1.795071614528595e-06, "logits/chosen": -1.718130111694336, "logits/rejected": -1.7055214643478394, "logps/chosen": -30.434080123901367, "logps/rejected": -36.954437255859375, "loss": 0.3009, "rewards/accuracies": 1.0, "rewards/chosen": 1.9881147146224976, "rewards/margins": 0.192968487739563, "rewards/rejected": 1.7951462268829346, "step": 6595 }, { "epoch": 1.46, "learning_rate": 1.7936961139655473e-06, "logits/chosen": -2.010542869567871, "logits/rejected": -1.9674345254898071, "logps/chosen": -31.25970458984375, "logps/rejected": -46.9766960144043, "loss": 0.6228, "rewards/accuracies": 0.0, "rewards/chosen": 4.5740180015563965, "rewards/margins": -0.30410051345825195, "rewards/rejected": 4.878118515014648, "step": 6596 }, { "epoch": 1.46, "learning_rate": 1.7923210254048412e-06, "logits/chosen": -1.8396527767181396, "logits/rejected": -1.8336302042007446, "logps/chosen": -43.025569915771484, "logps/rejected": -58.9634895324707, "loss": 0.0882, "rewards/accuracies": 1.0, "rewards/chosen": 4.710164546966553, "rewards/margins": 1.7815172672271729, "rewards/rejected": 2.92864727973938, "step": 6597 }, { "epoch": 1.46, "learning_rate": 1.7909463490231716e-06, "logits/chosen": -1.8570330142974854, "logits/rejected": -1.8084965944290161, "logps/chosen": -21.185707092285156, "logps/rejected": -9.291213989257812, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 4.0810980796813965, "rewards/margins": 3.2244224548339844, "rewards/rejected": 0.8566757440567017, "step": 6598 }, { "epoch": 1.46, "learning_rate": 1.7895720849971826e-06, "logits/chosen": -1.907973289489746, "logits/rejected": -1.8659226894378662, "logps/chosen": -63.15404510498047, "logps/rejected": -96.49441528320312, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": 9.796990394592285, "rewards/margins": 1.8721184730529785, "rewards/rejected": 7.924871921539307, "step": 6599 }, { "epoch": 1.46, "learning_rate": 1.7881982335034625e-06, "logits/chosen": -1.9521273374557495, "logits/rejected": -1.8934649229049683, "logps/chosen": -112.37862396240234, "logps/rejected": -118.70240783691406, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 9.163138389587402, "rewards/margins": 2.69016170501709, "rewards/rejected": 6.4729766845703125, "step": 6600 }, { "epoch": 1.46, "learning_rate": 1.7868247947185486e-06, "logits/chosen": -2.0172066688537598, "logits/rejected": -2.0615971088409424, "logps/chosen": -32.62513732910156, "logps/rejected": -73.06363677978516, "loss": 0.8525, "rewards/accuracies": 0.0, "rewards/chosen": 5.612668514251709, "rewards/margins": -1.2002811431884766, "rewards/rejected": 6.8129496574401855, "step": 6601 }, { "epoch": 1.46, "learning_rate": 1.7854517688189261e-06, "logits/chosen": -1.8258800506591797, "logits/rejected": -1.8304303884506226, "logps/chosen": -41.44757843017578, "logps/rejected": -75.13558959960938, "loss": 0.6131, "rewards/accuracies": 1.0, "rewards/chosen": 5.192269325256348, "rewards/margins": 0.588015079498291, "rewards/rejected": 4.604254245758057, "step": 6602 }, { "epoch": 1.46, "learning_rate": 1.7840791559810222e-06, "logits/chosen": -1.9951246976852417, "logits/rejected": -1.9073486328125, "logps/chosen": -64.10608673095703, "logps/rejected": -102.2379379272461, "loss": 0.2142, "rewards/accuracies": 1.0, "rewards/chosen": 9.399358749389648, "rewards/margins": 2.3611531257629395, "rewards/rejected": 7.038205623626709, "step": 6603 }, { "epoch": 1.46, "learning_rate": 1.7827069563812156e-06, "logits/chosen": -1.941489815711975, "logits/rejected": -1.9230144023895264, "logps/chosen": -57.419471740722656, "logps/rejected": -42.791378021240234, "loss": 0.2193, "rewards/accuracies": 1.0, "rewards/chosen": 3.9850685596466064, "rewards/margins": 0.6060519218444824, "rewards/rejected": 3.379016637802124, "step": 6604 }, { "epoch": 1.46, "learning_rate": 1.781335170195831e-06, "logits/chosen": -2.145754098892212, "logits/rejected": -2.131441354751587, "logps/chosen": -61.10689926147461, "logps/rejected": -73.81132507324219, "loss": 0.5352, "rewards/accuracies": 0.0, "rewards/chosen": 2.8130619525909424, "rewards/margins": -0.2375330924987793, "rewards/rejected": 3.0505950450897217, "step": 6605 }, { "epoch": 1.46, "learning_rate": 1.7799637976011397e-06, "logits/chosen": -1.9424206018447876, "logits/rejected": -1.9783872365951538, "logps/chosen": -37.6660270690918, "logps/rejected": -84.29618835449219, "loss": 0.2689, "rewards/accuracies": 1.0, "rewards/chosen": 2.793177366256714, "rewards/margins": 0.6634960174560547, "rewards/rejected": 2.129681348800659, "step": 6606 }, { "epoch": 1.46, "learning_rate": 1.7785928387733598e-06, "logits/chosen": -1.8630239963531494, "logits/rejected": -1.8149144649505615, "logps/chosen": -39.33502197265625, "logps/rejected": -63.041778564453125, "loss": 0.2201, "rewards/accuracies": 1.0, "rewards/chosen": 3.3658998012542725, "rewards/margins": 0.7717835903167725, "rewards/rejected": 2.5941162109375, "step": 6607 }, { "epoch": 1.46, "learning_rate": 1.7772222938886557e-06, "logits/chosen": -2.3630146980285645, "logits/rejected": -2.366952896118164, "logps/chosen": -49.26249694824219, "logps/rejected": -49.479515075683594, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 7.970880031585693, "rewards/margins": 4.718789100646973, "rewards/rejected": 3.2520911693573, "step": 6608 }, { "epoch": 1.46, "learning_rate": 1.7758521631231396e-06, "logits/chosen": -2.2193174362182617, "logits/rejected": -2.166592836380005, "logps/chosen": -104.0090103149414, "logps/rejected": -83.0832748413086, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 7.037087440490723, "rewards/margins": 2.607396125793457, "rewards/rejected": 4.429691314697266, "step": 6609 }, { "epoch": 1.46, "learning_rate": 1.7744824466528699e-06, "logits/chosen": -2.0242795944213867, "logits/rejected": -1.6504064798355103, "logps/chosen": -26.497482299804688, "logps/rejected": -74.406005859375, "loss": 0.6676, "rewards/accuracies": 0.0, "rewards/chosen": 3.530388593673706, "rewards/margins": -0.2641129493713379, "rewards/rejected": 3.794501543045044, "step": 6610 }, { "epoch": 1.46, "learning_rate": 1.7731131446538536e-06, "logits/chosen": -1.7270780801773071, "logits/rejected": -1.6194347143173218, "logps/chosen": -57.49921417236328, "logps/rejected": -43.306034088134766, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 7.669434547424316, "rewards/margins": 2.668552875518799, "rewards/rejected": 5.000881671905518, "step": 6611 }, { "epoch": 1.46, "learning_rate": 1.7717442573020393e-06, "logits/chosen": -1.862247347831726, "logits/rejected": -1.6070079803466797, "logps/chosen": -94.35801696777344, "logps/rejected": -22.314783096313477, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 7.785025119781494, "rewards/margins": 6.2480034828186035, "rewards/rejected": 1.537021517753601, "step": 6612 }, { "epoch": 1.46, "learning_rate": 1.7703757847733272e-06, "logits/chosen": -1.842060923576355, "logits/rejected": -1.3926252126693726, "logps/chosen": -123.46522521972656, "logps/rejected": -25.08409309387207, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 5.920947551727295, "rewards/margins": 2.086423635482788, "rewards/rejected": 3.834523916244507, "step": 6613 }, { "epoch": 1.46, "learning_rate": 1.7690077272435636e-06, "logits/chosen": -2.0826451778411865, "logits/rejected": -2.0207619667053223, "logps/chosen": -60.2095832824707, "logps/rejected": -23.42668914794922, "loss": 0.2068, "rewards/accuracies": 1.0, "rewards/chosen": 2.903593063354492, "rewards/margins": 1.8218730688095093, "rewards/rejected": 1.081719994544983, "step": 6614 }, { "epoch": 1.46, "learning_rate": 1.7676400848885394e-06, "logits/chosen": -1.763671875, "logits/rejected": -1.4332208633422852, "logps/chosen": -69.84357452392578, "logps/rejected": -52.85785675048828, "loss": 1.0969, "rewards/accuracies": 0.0, "rewards/chosen": 4.890805244445801, "rewards/margins": -2.0630431175231934, "rewards/rejected": 6.953848361968994, "step": 6615 }, { "epoch": 1.46, "learning_rate": 1.766272857883994e-06, "logits/chosen": -1.9038152694702148, "logits/rejected": -1.8472614288330078, "logps/chosen": -46.44667053222656, "logps/rejected": -62.434906005859375, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 3.2549362182617188, "rewards/margins": 1.439788818359375, "rewards/rejected": 1.8151473999023438, "step": 6616 }, { "epoch": 1.46, "learning_rate": 1.7649060464056129e-06, "logits/chosen": -2.224355459213257, "logits/rejected": -2.227416515350342, "logps/chosen": -41.80447006225586, "logps/rejected": -73.12459564208984, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": 2.9324796199798584, "rewards/margins": 1.3463704586029053, "rewards/rejected": 1.5861091613769531, "step": 6617 }, { "epoch": 1.46, "learning_rate": 1.7635396506290276e-06, "logits/chosen": -1.959256887435913, "logits/rejected": -1.9229402542114258, "logps/chosen": -53.45962142944336, "logps/rejected": -83.66217041015625, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 4.180765151977539, "rewards/margins": 3.7164928913116455, "rewards/rejected": 0.46427232027053833, "step": 6618 }, { "epoch": 1.47, "learning_rate": 1.7621736707298187e-06, "logits/chosen": -2.0570425987243652, "logits/rejected": -1.586871862411499, "logps/chosen": -61.27072525024414, "logps/rejected": -56.71266174316406, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 3.9480130672454834, "rewards/margins": 1.343470573425293, "rewards/rejected": 2.6045424938201904, "step": 6619 }, { "epoch": 1.47, "learning_rate": 1.7608081068835065e-06, "logits/chosen": -1.8362356424331665, "logits/rejected": -1.9629579782485962, "logps/chosen": -29.295665740966797, "logps/rejected": -128.76138305664062, "loss": 3.8687, "rewards/accuracies": 0.0, "rewards/chosen": 3.7505688667297363, "rewards/margins": -7.630702495574951, "rewards/rejected": 11.381271362304688, "step": 6620 }, { "epoch": 1.47, "learning_rate": 1.7594429592655687e-06, "logits/chosen": -1.9580340385437012, "logits/rejected": -1.8901128768920898, "logps/chosen": -47.45363998413086, "logps/rejected": -51.90692901611328, "loss": 0.2605, "rewards/accuracies": 1.0, "rewards/chosen": 2.31870698928833, "rewards/margins": 2.3483896255493164, "rewards/rejected": -0.029682541266083717, "step": 6621 }, { "epoch": 1.47, "learning_rate": 1.758078228051419e-06, "logits/chosen": -2.0199105739593506, "logits/rejected": -2.030035972595215, "logps/chosen": -33.623809814453125, "logps/rejected": -57.49029541015625, "loss": 0.5244, "rewards/accuracies": 1.0, "rewards/chosen": 3.3171722888946533, "rewards/margins": 1.564771294593811, "rewards/rejected": 1.7524009943008423, "step": 6622 }, { "epoch": 1.47, "learning_rate": 1.756713913416424e-06, "logits/chosen": -1.8289777040481567, "logits/rejected": -1.757225513458252, "logps/chosen": -55.256553649902344, "logps/rejected": -43.519012451171875, "loss": 0.1835, "rewards/accuracies": 1.0, "rewards/chosen": 2.8495354652404785, "rewards/margins": 1.0343502759933472, "rewards/rejected": 1.8151851892471313, "step": 6623 }, { "epoch": 1.47, "learning_rate": 1.7553500155358942e-06, "logits/chosen": -2.094348192214966, "logits/rejected": -2.094576597213745, "logps/chosen": -57.25159454345703, "logps/rejected": -40.04768371582031, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": 5.163856029510498, "rewards/margins": 2.9540419578552246, "rewards/rejected": 2.2098140716552734, "step": 6624 }, { "epoch": 1.47, "learning_rate": 1.7539865345850875e-06, "logits/chosen": -2.0915262699127197, "logits/rejected": -2.037353515625, "logps/chosen": -52.04943084716797, "logps/rejected": -38.72814178466797, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 4.802913665771484, "rewards/margins": 1.2598669528961182, "rewards/rejected": 3.543046712875366, "step": 6625 }, { "epoch": 1.47, "learning_rate": 1.7526234707392075e-06, "logits/chosen": -2.22234845161438, "logits/rejected": -2.229548692703247, "logps/chosen": -51.998863220214844, "logps/rejected": -54.372344970703125, "loss": 0.3384, "rewards/accuracies": 1.0, "rewards/chosen": 5.866611480712891, "rewards/margins": 0.046506404876708984, "rewards/rejected": 5.820105075836182, "step": 6626 }, { "epoch": 1.47, "learning_rate": 1.751260824173406e-06, "logits/chosen": -2.1609394550323486, "logits/rejected": -2.126791477203369, "logps/chosen": -68.61248779296875, "logps/rejected": -77.95764923095703, "loss": 1.454, "rewards/accuracies": 1.0, "rewards/chosen": 4.3772454261779785, "rewards/margins": 2.0995986461639404, "rewards/rejected": 2.277646780014038, "step": 6627 }, { "epoch": 1.47, "learning_rate": 1.7498985950627794e-06, "logits/chosen": -1.956763505935669, "logits/rejected": -1.9544646739959717, "logps/chosen": -55.24074172973633, "logps/rejected": -89.062744140625, "loss": 0.1868, "rewards/accuracies": 1.0, "rewards/chosen": 5.78963041305542, "rewards/margins": 3.228882074356079, "rewards/rejected": 2.560748338699341, "step": 6628 }, { "epoch": 1.47, "learning_rate": 1.748536783582368e-06, "logits/chosen": -1.947235107421875, "logits/rejected": -1.971456527709961, "logps/chosen": -90.61045837402344, "logps/rejected": -74.52714538574219, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 7.196794033050537, "rewards/margins": 1.710069179534912, "rewards/rejected": 5.486724853515625, "step": 6629 }, { "epoch": 1.47, "learning_rate": 1.7471753899071663e-06, "logits/chosen": -1.8698885440826416, "logits/rejected": -1.820427656173706, "logps/chosen": -53.92433166503906, "logps/rejected": -44.85693359375, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 4.67772912979126, "rewards/margins": 2.6025795936584473, "rewards/rejected": 2.0751495361328125, "step": 6630 }, { "epoch": 1.47, "learning_rate": 1.7458144142121058e-06, "logits/chosen": -2.219743490219116, "logits/rejected": -2.252917528152466, "logps/chosen": -32.799476623535156, "logps/rejected": -111.99893188476562, "loss": 1.1894, "rewards/accuracies": 0.0, "rewards/chosen": 4.091163635253906, "rewards/margins": -2.118068218231201, "rewards/rejected": 6.209231853485107, "step": 6631 }, { "epoch": 1.47, "learning_rate": 1.7444538566720703e-06, "logits/chosen": -2.1760075092315674, "logits/rejected": -2.1884701251983643, "logps/chosen": -34.28651428222656, "logps/rejected": -56.34419250488281, "loss": 0.5622, "rewards/accuracies": 0.0, "rewards/chosen": 4.49627685546875, "rewards/margins": -0.3119010925292969, "rewards/rejected": 4.808177947998047, "step": 6632 }, { "epoch": 1.47, "learning_rate": 1.7430937174618884e-06, "logits/chosen": -1.9944932460784912, "logits/rejected": -1.9954562187194824, "logps/chosen": -61.18568801879883, "logps/rejected": -158.05020141601562, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": 7.95259428024292, "rewards/margins": 2.187356948852539, "rewards/rejected": 5.765237331390381, "step": 6633 }, { "epoch": 1.47, "learning_rate": 1.741733996756334e-06, "logits/chosen": -2.2555556297302246, "logits/rejected": -2.2435970306396484, "logps/chosen": -87.88844299316406, "logps/rejected": -32.983497619628906, "loss": 0.3146, "rewards/accuracies": 1.0, "rewards/chosen": 9.868197441101074, "rewards/margins": 9.162800788879395, "rewards/rejected": 0.705396294593811, "step": 6634 }, { "epoch": 1.47, "learning_rate": 1.7403746947301286e-06, "logits/chosen": -2.090747833251953, "logits/rejected": -2.0316529273986816, "logps/chosen": -95.44151306152344, "logps/rejected": -45.40997314453125, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 6.041606426239014, "rewards/margins": 2.9391214847564697, "rewards/rejected": 3.102484941482544, "step": 6635 }, { "epoch": 1.47, "learning_rate": 1.7390158115579386e-06, "logits/chosen": -1.9770535230636597, "logits/rejected": -1.9713345766067505, "logps/chosen": -35.995208740234375, "logps/rejected": -47.25209045410156, "loss": 0.2527, "rewards/accuracies": 1.0, "rewards/chosen": 3.6581008434295654, "rewards/margins": 0.5591063499450684, "rewards/rejected": 3.098994493484497, "step": 6636 }, { "epoch": 1.47, "learning_rate": 1.7376573474143798e-06, "logits/chosen": -2.0787339210510254, "logits/rejected": -1.6784411668777466, "logps/chosen": -31.41934585571289, "logps/rejected": -81.87130737304688, "loss": 0.7185, "rewards/accuracies": 0.0, "rewards/chosen": 4.905898571014404, "rewards/margins": -1.159024715423584, "rewards/rejected": 6.064923286437988, "step": 6637 }, { "epoch": 1.47, "learning_rate": 1.736299302474006e-06, "logits/chosen": -1.9603992700576782, "logits/rejected": -1.9615947008132935, "logps/chosen": -78.45453643798828, "logps/rejected": -69.53030395507812, "loss": 0.6143, "rewards/accuracies": 0.0, "rewards/chosen": 2.2613365650177, "rewards/margins": -0.4696953296661377, "rewards/rejected": 2.731031894683838, "step": 6638 }, { "epoch": 1.47, "learning_rate": 1.7349416769113292e-06, "logits/chosen": -1.833349585533142, "logits/rejected": -1.8284361362457275, "logps/chosen": -41.262535095214844, "logps/rejected": -37.27482986450195, "loss": 0.3553, "rewards/accuracies": 1.0, "rewards/chosen": 4.088120460510254, "rewards/margins": 0.3509633541107178, "rewards/rejected": 3.737157106399536, "step": 6639 }, { "epoch": 1.47, "learning_rate": 1.7335844709007948e-06, "logits/chosen": -1.5400190353393555, "logits/rejected": -1.4895179271697998, "logps/chosen": -27.441017150878906, "logps/rejected": -13.352032661437988, "loss": 0.3572, "rewards/accuracies": 0.0, "rewards/chosen": 2.2910423278808594, "rewards/margins": -0.03493499755859375, "rewards/rejected": 2.325977325439453, "step": 6640 }, { "epoch": 1.47, "learning_rate": 1.7322276846168068e-06, "logits/chosen": -1.8707449436187744, "logits/rejected": -1.845339059829712, "logps/chosen": -61.45975875854492, "logps/rejected": -35.60219192504883, "loss": 0.1503, "rewards/accuracies": 1.0, "rewards/chosen": 3.602675199508667, "rewards/margins": 1.2554411888122559, "rewards/rejected": 2.347234010696411, "step": 6641 }, { "epoch": 1.47, "learning_rate": 1.7308713182337044e-06, "logits/chosen": -1.7555983066558838, "logits/rejected": -1.7371734380722046, "logps/chosen": -37.695396423339844, "logps/rejected": -56.53564453125, "loss": 0.4465, "rewards/accuracies": 0.0, "rewards/chosen": 4.594099521636963, "rewards/margins": -0.33081674575805664, "rewards/rejected": 4.9249162673950195, "step": 6642 }, { "epoch": 1.47, "learning_rate": 1.729515371925779e-06, "logits/chosen": -1.8782002925872803, "logits/rejected": -1.8395344018936157, "logps/chosen": -57.59446716308594, "logps/rejected": -57.89634704589844, "loss": 0.6249, "rewards/accuracies": 0.0, "rewards/chosen": 2.45896315574646, "rewards/margins": -0.13120174407958984, "rewards/rejected": 2.59016489982605, "step": 6643 }, { "epoch": 1.47, "learning_rate": 1.728159845867266e-06, "logits/chosen": -2.184481620788574, "logits/rejected": -2.179933786392212, "logps/chosen": -82.55523681640625, "logps/rejected": -44.341468811035156, "loss": 0.5217, "rewards/accuracies": 0.0, "rewards/chosen": 4.556729316711426, "rewards/margins": -0.5580282211303711, "rewards/rejected": 5.114757537841797, "step": 6644 }, { "epoch": 1.47, "learning_rate": 1.7268047402323472e-06, "logits/chosen": -1.85872220993042, "logits/rejected": -1.8167660236358643, "logps/chosen": -31.297870635986328, "logps/rejected": -41.5653076171875, "loss": 0.1399, "rewards/accuracies": 1.0, "rewards/chosen": 4.359583854675293, "rewards/margins": 1.15641450881958, "rewards/rejected": 3.203169345855713, "step": 6645 }, { "epoch": 1.47, "learning_rate": 1.725450055195153e-06, "logits/chosen": -2.247551202774048, "logits/rejected": -1.8974274396896362, "logps/chosen": -49.20244598388672, "logps/rejected": -143.98785400390625, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": 8.296377182006836, "rewards/margins": 1.7286357879638672, "rewards/rejected": 6.567741394042969, "step": 6646 }, { "epoch": 1.47, "learning_rate": 1.7240957909297511e-06, "logits/chosen": -2.3262643814086914, "logits/rejected": -2.25030779838562, "logps/chosen": -98.03744506835938, "logps/rejected": -39.623931884765625, "loss": 0.5505, "rewards/accuracies": 1.0, "rewards/chosen": 6.873162746429443, "rewards/margins": 5.8889312744140625, "rewards/rejected": 0.9842315912246704, "step": 6647 }, { "epoch": 1.47, "learning_rate": 1.7227419476101687e-06, "logits/chosen": -2.01149845123291, "logits/rejected": -2.0119307041168213, "logps/chosen": -35.08595275878906, "logps/rejected": -35.894107818603516, "loss": 1.3105, "rewards/accuracies": 1.0, "rewards/chosen": 2.1686744689941406, "rewards/margins": 1.0196353197097778, "rewards/rejected": 1.1490391492843628, "step": 6648 }, { "epoch": 1.47, "learning_rate": 1.721388525410364e-06, "logits/chosen": -2.0454087257385254, "logits/rejected": -1.9518160820007324, "logps/chosen": -46.11677551269531, "logps/rejected": -149.77130126953125, "loss": 0.7737, "rewards/accuracies": 0.0, "rewards/chosen": 4.7308173179626465, "rewards/margins": -0.9821681976318359, "rewards/rejected": 5.712985515594482, "step": 6649 }, { "epoch": 1.47, "learning_rate": 1.720035524504256e-06, "logits/chosen": -2.172974109649658, "logits/rejected": -2.208686351776123, "logps/chosen": -120.32992553710938, "logps/rejected": -135.99008178710938, "loss": 0.1331, "rewards/accuracies": 1.0, "rewards/chosen": 11.535791397094727, "rewards/margins": 3.8773012161254883, "rewards/rejected": 7.658490180969238, "step": 6650 }, { "epoch": 1.47, "learning_rate": 1.7186829450656961e-06, "logits/chosen": -1.9171013832092285, "logits/rejected": -1.9698797464370728, "logps/chosen": -100.37342834472656, "logps/rejected": -133.34786987304688, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 10.662050247192383, "rewards/margins": 4.570408821105957, "rewards/rejected": 6.091641426086426, "step": 6651 }, { "epoch": 1.47, "learning_rate": 1.7173307872684902e-06, "logits/chosen": -2.048680305480957, "logits/rejected": -2.0386767387390137, "logps/chosen": -35.5493278503418, "logps/rejected": -49.941688537597656, "loss": 0.191, "rewards/accuracies": 1.0, "rewards/chosen": 3.5681636333465576, "rewards/margins": 1.8875033855438232, "rewards/rejected": 1.6806602478027344, "step": 6652 }, { "epoch": 1.47, "learning_rate": 1.7159790512863866e-06, "logits/chosen": -2.09861421585083, "logits/rejected": -2.067678689956665, "logps/chosen": -91.46533203125, "logps/rejected": -25.991886138916016, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 6.210606575012207, "rewards/margins": 4.202505588531494, "rewards/rejected": 2.008100986480713, "step": 6653 }, { "epoch": 1.47, "learning_rate": 1.7146277372930804e-06, "logits/chosen": -1.8916513919830322, "logits/rejected": -1.938425064086914, "logps/chosen": -62.79267883300781, "logps/rejected": -94.25932312011719, "loss": 0.4352, "rewards/accuracies": 0.0, "rewards/chosen": 8.81537914276123, "rewards/margins": -0.19652748107910156, "rewards/rejected": 9.011906623840332, "step": 6654 }, { "epoch": 1.47, "learning_rate": 1.7132768454622133e-06, "logits/chosen": -2.0170845985412598, "logits/rejected": -1.9788527488708496, "logps/chosen": -51.012332916259766, "logps/rejected": -59.80655288696289, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": 4.9785380363464355, "rewards/margins": 2.0324881076812744, "rewards/rejected": 2.946049928665161, "step": 6655 }, { "epoch": 1.47, "learning_rate": 1.7119263759673677e-06, "logits/chosen": -1.8827359676361084, "logits/rejected": -1.7482157945632935, "logps/chosen": -79.69987487792969, "logps/rejected": -75.79174041748047, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": 9.966325759887695, "rewards/margins": 2.3339791297912598, "rewards/rejected": 7.6323466300964355, "step": 6656 }, { "epoch": 1.47, "learning_rate": 1.710576328982082e-06, "logits/chosen": -2.1125781536102295, "logits/rejected": -1.947174072265625, "logps/chosen": -85.47039794921875, "logps/rejected": -19.186830520629883, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 6.711981296539307, "rewards/margins": 6.08425760269165, "rewards/rejected": 0.6277236938476562, "step": 6657 }, { "epoch": 1.47, "learning_rate": 1.7092267046798267e-06, "logits/chosen": -1.8527127504348755, "logits/rejected": -1.904543399810791, "logps/chosen": -67.66036987304688, "logps/rejected": -94.24859619140625, "loss": 2.3464, "rewards/accuracies": 0.0, "rewards/chosen": 2.673452138900757, "rewards/margins": -4.626665115356445, "rewards/rejected": 7.300117492675781, "step": 6658 }, { "epoch": 1.47, "learning_rate": 1.7078775032340329e-06, "logits/chosen": -1.9629199504852295, "logits/rejected": -1.8906261920928955, "logps/chosen": -76.23252868652344, "logps/rejected": -98.67615509033203, "loss": 0.2223, "rewards/accuracies": 1.0, "rewards/chosen": 10.50982666015625, "rewards/margins": 0.6107645034790039, "rewards/rejected": 9.899062156677246, "step": 6659 }, { "epoch": 1.47, "learning_rate": 1.7065287248180635e-06, "logits/chosen": -1.976171612739563, "logits/rejected": -2.0226800441741943, "logps/chosen": -36.51469802856445, "logps/rejected": -57.8268928527832, "loss": 1.4793, "rewards/accuracies": 0.0, "rewards/chosen": 3.0565342903137207, "rewards/margins": -2.1558189392089844, "rewards/rejected": 5.212353229522705, "step": 6660 }, { "epoch": 1.47, "learning_rate": 1.705180369605236e-06, "logits/chosen": -1.905798077583313, "logits/rejected": -1.7953517436981201, "logps/chosen": -78.91310119628906, "logps/rejected": -46.244384765625, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 8.443129539489746, "rewards/margins": 3.166102886199951, "rewards/rejected": 5.277026653289795, "step": 6661 }, { "epoch": 1.47, "learning_rate": 1.7038324377688104e-06, "logits/chosen": -2.155592441558838, "logits/rejected": -2.1413211822509766, "logps/chosen": -42.0299072265625, "logps/rejected": -67.34806823730469, "loss": 0.135, "rewards/accuracies": 1.0, "rewards/chosen": 3.480120897293091, "rewards/margins": 1.1732177734375, "rewards/rejected": 2.306903123855591, "step": 6662 }, { "epoch": 1.47, "learning_rate": 1.7024849294819927e-06, "logits/chosen": -1.9012315273284912, "logits/rejected": -1.753750205039978, "logps/chosen": -199.75894165039062, "logps/rejected": -54.224456787109375, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 5.937930583953857, "rewards/margins": 2.837695598602295, "rewards/rejected": 3.1002349853515625, "step": 6663 }, { "epoch": 1.47, "learning_rate": 1.7011378449179366e-06, "logits/chosen": -2.0908379554748535, "logits/rejected": -2.0494329929351807, "logps/chosen": -119.69552612304688, "logps/rejected": -47.95440673828125, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 7.232354640960693, "rewards/margins": 4.637229919433594, "rewards/rejected": 2.5951249599456787, "step": 6664 }, { "epoch": 1.48, "learning_rate": 1.699791184249734e-06, "logits/chosen": -1.8246079683303833, "logits/rejected": -1.8591352701187134, "logps/chosen": -39.57426834106445, "logps/rejected": -90.46792602539062, "loss": 0.2123, "rewards/accuracies": 1.0, "rewards/chosen": 5.145470142364502, "rewards/margins": 0.6415395736694336, "rewards/rejected": 4.503930568695068, "step": 6665 }, { "epoch": 1.48, "learning_rate": 1.6984449476504334e-06, "logits/chosen": -1.6072486639022827, "logits/rejected": -1.6007381677627563, "logps/chosen": -39.2077751159668, "logps/rejected": -43.46463394165039, "loss": 0.7984, "rewards/accuracies": 0.0, "rewards/chosen": 1.439982295036316, "rewards/margins": -1.3609062433242798, "rewards/rejected": 2.8008885383605957, "step": 6666 }, { "epoch": 1.48, "learning_rate": 1.6970991352930172e-06, "logits/chosen": -1.7808114290237427, "logits/rejected": -1.799475908279419, "logps/chosen": -53.718994140625, "logps/rejected": -81.63919067382812, "loss": 0.2057, "rewards/accuracies": 1.0, "rewards/chosen": 3.9185211658477783, "rewards/margins": 0.7068450450897217, "rewards/rejected": 3.2116761207580566, "step": 6667 }, { "epoch": 1.48, "learning_rate": 1.695753747350426e-06, "logits/chosen": -2.0654990673065186, "logits/rejected": -2.0556952953338623, "logps/chosen": -129.63711547851562, "logps/rejected": -62.38672637939453, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 7.529748439788818, "rewards/margins": 3.8646368980407715, "rewards/rejected": 3.665111541748047, "step": 6668 }, { "epoch": 1.48, "learning_rate": 1.694408783995533e-06, "logits/chosen": -2.1544387340545654, "logits/rejected": -2.037461519241333, "logps/chosen": -84.97920227050781, "logps/rejected": -24.777652740478516, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 4.075644016265869, "rewards/margins": 3.277318239212036, "rewards/rejected": 0.7983257174491882, "step": 6669 }, { "epoch": 1.48, "learning_rate": 1.6930642454011647e-06, "logits/chosen": -1.7436450719833374, "logits/rejected": -1.723252773284912, "logps/chosen": -40.89032745361328, "logps/rejected": -32.246543884277344, "loss": 0.1637, "rewards/accuracies": 1.0, "rewards/chosen": 3.183020830154419, "rewards/margins": 0.9509353637695312, "rewards/rejected": 2.2320854663848877, "step": 6670 }, { "epoch": 1.48, "learning_rate": 1.6917201317400916e-06, "logits/chosen": -2.300781011581421, "logits/rejected": -2.238071918487549, "logps/chosen": -68.71893310546875, "logps/rejected": -24.770450592041016, "loss": 0.1373, "rewards/accuracies": 1.0, "rewards/chosen": 3.9423890113830566, "rewards/margins": 3.230468511581421, "rewards/rejected": 0.7119205594062805, "step": 6671 }, { "epoch": 1.48, "learning_rate": 1.6903764431850284e-06, "logits/chosen": -2.047589063644409, "logits/rejected": -2.0081045627593994, "logps/chosen": -97.0343017578125, "logps/rejected": -57.891700744628906, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 7.528720378875732, "rewards/margins": 4.208523750305176, "rewards/rejected": 3.3201966285705566, "step": 6672 }, { "epoch": 1.48, "learning_rate": 1.6890331799086379e-06, "logits/chosen": -2.3407816886901855, "logits/rejected": -2.3180980682373047, "logps/chosen": -87.23686981201172, "logps/rejected": -81.64380645751953, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 5.784952640533447, "rewards/margins": 3.374037981033325, "rewards/rejected": 2.410914659500122, "step": 6673 }, { "epoch": 1.48, "learning_rate": 1.6876903420835205e-06, "logits/chosen": -1.944226622581482, "logits/rejected": -1.8064937591552734, "logps/chosen": -101.37965393066406, "logps/rejected": -72.88326263427734, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 7.039218425750732, "rewards/margins": 4.76143741607666, "rewards/rejected": 2.2777810096740723, "step": 6674 }, { "epoch": 1.48, "learning_rate": 1.686347929882236e-06, "logits/chosen": -1.9436345100402832, "logits/rejected": -1.5066581964492798, "logps/chosen": -8.093058586120605, "logps/rejected": -160.22792053222656, "loss": 3.2663, "rewards/accuracies": 0.0, "rewards/chosen": 0.08604001998901367, "rewards/margins": -6.361550807952881, "rewards/rejected": 6.4475908279418945, "step": 6675 }, { "epoch": 1.48, "learning_rate": 1.6850059434772726e-06, "logits/chosen": -1.8381606340408325, "logits/rejected": -1.8669211864471436, "logps/chosen": -89.79714965820312, "logps/rejected": -39.12863540649414, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 11.582343101501465, "rewards/margins": 6.604588985443115, "rewards/rejected": 4.97775411605835, "step": 6676 }, { "epoch": 1.48, "learning_rate": 1.6836643830410798e-06, "logits/chosen": -1.661190390586853, "logits/rejected": -1.6822212934494019, "logps/chosen": -82.31141662597656, "logps/rejected": -63.841148376464844, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 8.740965843200684, "rewards/margins": 5.288715839385986, "rewards/rejected": 3.4522500038146973, "step": 6677 }, { "epoch": 1.48, "learning_rate": 1.68232324874604e-06, "logits/chosen": -2.0097036361694336, "logits/rejected": -1.9978331327438354, "logps/chosen": -21.918745040893555, "logps/rejected": -48.86237716674805, "loss": 0.8258, "rewards/accuracies": 0.0, "rewards/chosen": 3.567763566970825, "rewards/margins": -0.900559663772583, "rewards/rejected": 4.468323230743408, "step": 6678 }, { "epoch": 1.48, "learning_rate": 1.6809825407644875e-06, "logits/chosen": -1.925726056098938, "logits/rejected": -1.4526177644729614, "logps/chosen": -84.83049011230469, "logps/rejected": -65.04554748535156, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 7.800473213195801, "rewards/margins": 2.9989142417907715, "rewards/rejected": 4.801558971405029, "step": 6679 }, { "epoch": 1.48, "learning_rate": 1.6796422592686995e-06, "logits/chosen": -2.115774631500244, "logits/rejected": -2.1309006214141846, "logps/chosen": -47.9763298034668, "logps/rejected": -52.487396240234375, "loss": 0.8402, "rewards/accuracies": 1.0, "rewards/chosen": 3.5870823860168457, "rewards/margins": 1.7197155952453613, "rewards/rejected": 1.8673667907714844, "step": 6680 }, { "epoch": 1.48, "learning_rate": 1.6783024044308998e-06, "logits/chosen": -1.9451812505722046, "logits/rejected": -1.9220041036605835, "logps/chosen": -148.84234619140625, "logps/rejected": -49.1851806640625, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 6.877662658691406, "rewards/margins": 2.884756326675415, "rewards/rejected": 3.992906332015991, "step": 6681 }, { "epoch": 1.48, "learning_rate": 1.676962976423257e-06, "logits/chosen": -2.097332715988159, "logits/rejected": -1.899175763130188, "logps/chosen": -51.475460052490234, "logps/rejected": -64.71122741699219, "loss": 0.3587, "rewards/accuracies": 1.0, "rewards/chosen": 5.060225486755371, "rewards/margins": 0.8179082870483398, "rewards/rejected": 4.242317199707031, "step": 6682 }, { "epoch": 1.48, "learning_rate": 1.6756239754178805e-06, "logits/chosen": -1.8820863962173462, "logits/rejected": -1.813852310180664, "logps/chosen": -96.72200012207031, "logps/rejected": -83.40943145751953, "loss": 0.1924, "rewards/accuracies": 1.0, "rewards/chosen": 7.375120639801025, "rewards/margins": 2.201326847076416, "rewards/rejected": 5.173793792724609, "step": 6683 }, { "epoch": 1.48, "learning_rate": 1.6742854015868349e-06, "logits/chosen": -2.1048471927642822, "logits/rejected": -2.08994197845459, "logps/chosen": -39.621192932128906, "logps/rejected": -17.576618194580078, "loss": 0.2283, "rewards/accuracies": 1.0, "rewards/chosen": 4.936087131500244, "rewards/margins": 2.6401021480560303, "rewards/rejected": 2.295984983444214, "step": 6684 }, { "epoch": 1.48, "learning_rate": 1.6729472551021182e-06, "logits/chosen": -1.9325743913650513, "logits/rejected": -1.9101974964141846, "logps/chosen": -41.81642150878906, "logps/rejected": -28.664714813232422, "loss": 0.6187, "rewards/accuracies": 0.0, "rewards/chosen": 3.2496659755706787, "rewards/margins": -0.8075611591339111, "rewards/rejected": 4.05722713470459, "step": 6685 }, { "epoch": 1.48, "learning_rate": 1.6716095361356816e-06, "logits/chosen": -1.9421075582504272, "logits/rejected": -1.8470983505249023, "logps/chosen": -162.12322998046875, "logps/rejected": -11.863529205322266, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 4.724223613739014, "rewards/margins": 4.291628837585449, "rewards/rejected": 0.4325948655605316, "step": 6686 }, { "epoch": 1.48, "learning_rate": 1.6702722448594183e-06, "logits/chosen": -1.8886867761611938, "logits/rejected": -1.8240594863891602, "logps/chosen": -97.40634155273438, "logps/rejected": -58.434776306152344, "loss": 0.1904, "rewards/accuracies": 1.0, "rewards/chosen": 6.320098876953125, "rewards/margins": 0.843167781829834, "rewards/rejected": 5.476931095123291, "step": 6687 }, { "epoch": 1.48, "learning_rate": 1.668935381445167e-06, "logits/chosen": -1.8940683603286743, "logits/rejected": -1.7157701253890991, "logps/chosen": -145.29148864746094, "logps/rejected": -56.877193450927734, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 9.01996898651123, "rewards/margins": 6.856546401977539, "rewards/rejected": 2.1634228229522705, "step": 6688 }, { "epoch": 1.48, "learning_rate": 1.6675989460647117e-06, "logits/chosen": -2.0114920139312744, "logits/rejected": -2.031013250350952, "logps/chosen": -75.70136260986328, "logps/rejected": -42.264549255371094, "loss": 0.3523, "rewards/accuracies": 1.0, "rewards/chosen": 4.084600925445557, "rewards/margins": 0.06904315948486328, "rewards/rejected": 4.015557765960693, "step": 6689 }, { "epoch": 1.48, "learning_rate": 1.666262938889781e-06, "logits/chosen": -1.9027173519134521, "logits/rejected": -1.9027173519134521, "logps/chosen": -48.62803649902344, "logps/rejected": -48.62803649902344, "loss": 0.3487, "rewards/accuracies": 0.0, "rewards/chosen": 4.885039806365967, "rewards/margins": 0.0, "rewards/rejected": 4.885039806365967, "step": 6690 }, { "epoch": 1.48, "learning_rate": 1.6649273600920484e-06, "logits/chosen": -2.055677652359009, "logits/rejected": -2.0301456451416016, "logps/chosen": -120.6460189819336, "logps/rejected": -107.88371276855469, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 6.723905086517334, "rewards/margins": 1.6842918395996094, "rewards/rejected": 5.039613246917725, "step": 6691 }, { "epoch": 1.48, "learning_rate": 1.6635922098431328e-06, "logits/chosen": -1.9733972549438477, "logits/rejected": -1.9447249174118042, "logps/chosen": -48.776458740234375, "logps/rejected": -57.044776916503906, "loss": 0.1428, "rewards/accuracies": 1.0, "rewards/chosen": 3.8781661987304688, "rewards/margins": 1.4607124328613281, "rewards/rejected": 2.4174537658691406, "step": 6692 }, { "epoch": 1.48, "learning_rate": 1.6622574883145998e-06, "logits/chosen": -1.8101342916488647, "logits/rejected": -1.7718478441238403, "logps/chosen": -57.128814697265625, "logps/rejected": -45.919349670410156, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": 4.016132354736328, "rewards/margins": 1.338771104812622, "rewards/rejected": 2.677361249923706, "step": 6693 }, { "epoch": 1.48, "learning_rate": 1.6609231956779538e-06, "logits/chosen": -2.1040914058685303, "logits/rejected": -2.0845513343811035, "logps/chosen": -29.856773376464844, "logps/rejected": -60.23019790649414, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": 2.940730333328247, "rewards/margins": 0.5562701225280762, "rewards/rejected": 2.384460210800171, "step": 6694 }, { "epoch": 1.48, "learning_rate": 1.6595893321046503e-06, "logits/chosen": -1.9604825973510742, "logits/rejected": -1.9583919048309326, "logps/chosen": -23.1390323638916, "logps/rejected": -48.8759880065918, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": 3.844862461090088, "rewards/margins": 0.8247325420379639, "rewards/rejected": 3.020129919052124, "step": 6695 }, { "epoch": 1.48, "learning_rate": 1.6582558977660874e-06, "logits/chosen": -2.059819459915161, "logits/rejected": -2.059819459915161, "logps/chosen": -44.78948211669922, "logps/rejected": -44.78948211669922, "loss": 1.0289, "rewards/accuracies": 0.0, "rewards/chosen": 2.8021538257598877, "rewards/margins": 0.0, "rewards/rejected": 2.8021538257598877, "step": 6696 }, { "epoch": 1.48, "learning_rate": 1.656922892833608e-06, "logits/chosen": -1.8694469928741455, "logits/rejected": -1.8389222621917725, "logps/chosen": -17.084640502929688, "logps/rejected": -12.823531150817871, "loss": 0.4062, "rewards/accuracies": 1.0, "rewards/chosen": 2.5720341205596924, "rewards/margins": 2.0660481452941895, "rewards/rejected": 0.5059859156608582, "step": 6697 }, { "epoch": 1.48, "learning_rate": 1.655590317478501e-06, "logits/chosen": -1.7317873239517212, "logits/rejected": -1.7171190977096558, "logps/chosen": -21.372364044189453, "logps/rejected": -25.90271759033203, "loss": 0.4185, "rewards/accuracies": 0.0, "rewards/chosen": 0.9294613003730774, "rewards/margins": -0.17353779077529907, "rewards/rejected": 1.1029990911483765, "step": 6698 }, { "epoch": 1.48, "learning_rate": 1.654258171871998e-06, "logits/chosen": -2.0176310539245605, "logits/rejected": -2.0244078636169434, "logps/chosen": -36.55136489868164, "logps/rejected": -126.52511596679688, "loss": 0.3594, "rewards/accuracies": 0.0, "rewards/chosen": 7.595069885253906, "rewards/margins": -0.04313039779663086, "rewards/rejected": 7.638200283050537, "step": 6699 }, { "epoch": 1.48, "learning_rate": 1.652926456185277e-06, "logits/chosen": -2.0666592121124268, "logits/rejected": -2.0297515392303467, "logps/chosen": -33.31201934814453, "logps/rejected": -68.91967010498047, "loss": 0.1779, "rewards/accuracies": 1.0, "rewards/chosen": 5.289031982421875, "rewards/margins": 2.2484474182128906, "rewards/rejected": 3.0405845642089844, "step": 6700 }, { "epoch": 1.48, "learning_rate": 1.65159517058946e-06, "logits/chosen": -1.955780029296875, "logits/rejected": -1.944919228553772, "logps/chosen": -120.44983673095703, "logps/rejected": -135.69606018066406, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 10.377141952514648, "rewards/margins": 2.247767448425293, "rewards/rejected": 8.129374504089355, "step": 6701 }, { "epoch": 1.48, "learning_rate": 1.6502643152556148e-06, "logits/chosen": -2.086785316467285, "logits/rejected": -1.915514588356018, "logps/chosen": -160.85073852539062, "logps/rejected": -57.383155822753906, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 6.146514892578125, "rewards/margins": 6.382326602935791, "rewards/rejected": -0.23581162095069885, "step": 6702 }, { "epoch": 1.48, "learning_rate": 1.6489338903547542e-06, "logits/chosen": -1.9251209497451782, "logits/rejected": -1.4710161685943604, "logps/chosen": -70.18641662597656, "logps/rejected": -55.770423889160156, "loss": 0.3103, "rewards/accuracies": 1.0, "rewards/chosen": 3.7693824768066406, "rewards/margins": 1.675065517425537, "rewards/rejected": 2.0943169593811035, "step": 6703 }, { "epoch": 1.48, "learning_rate": 1.647603896057831e-06, "logits/chosen": -2.01666259765625, "logits/rejected": -1.998473048210144, "logps/chosen": -49.491661071777344, "logps/rejected": -64.56826782226562, "loss": 0.2549, "rewards/accuracies": 1.0, "rewards/chosen": 4.100832462310791, "rewards/margins": 0.8370950222015381, "rewards/rejected": 3.263737440109253, "step": 6704 }, { "epoch": 1.48, "learning_rate": 1.6462743325357482e-06, "logits/chosen": -1.7624471187591553, "logits/rejected": -1.7603633403778076, "logps/chosen": -109.88664245605469, "logps/rejected": -95.28515625, "loss": 0.1817, "rewards/accuracies": 1.0, "rewards/chosen": 6.347761631011963, "rewards/margins": 0.8312149047851562, "rewards/rejected": 5.516546726226807, "step": 6705 }, { "epoch": 1.48, "learning_rate": 1.644945199959352e-06, "logits/chosen": -1.954230546951294, "logits/rejected": -1.9745382070541382, "logps/chosen": -34.47307586669922, "logps/rejected": -81.87876892089844, "loss": 0.2203, "rewards/accuracies": 1.0, "rewards/chosen": 3.534153699874878, "rewards/margins": 0.6855056285858154, "rewards/rejected": 2.8486480712890625, "step": 6706 }, { "epoch": 1.48, "learning_rate": 1.6436164984994317e-06, "logits/chosen": -1.6501821279525757, "logits/rejected": -1.7613012790679932, "logps/chosen": -11.45999526977539, "logps/rejected": -46.568817138671875, "loss": 1.9812, "rewards/accuracies": 0.0, "rewards/chosen": 1.6620851755142212, "rewards/margins": -3.922593116760254, "rewards/rejected": 5.5846781730651855, "step": 6707 }, { "epoch": 1.48, "learning_rate": 1.6422882283267228e-06, "logits/chosen": -1.622694969177246, "logits/rejected": -1.63015615940094, "logps/chosen": -63.942665100097656, "logps/rejected": -138.27914428710938, "loss": 1.6766, "rewards/accuracies": 1.0, "rewards/chosen": 8.057045936584473, "rewards/margins": 1.1467995643615723, "rewards/rejected": 6.9102463722229, "step": 6708 }, { "epoch": 1.48, "learning_rate": 1.6409603896119053e-06, "logits/chosen": -2.1128711700439453, "logits/rejected": -1.9809951782226562, "logps/chosen": -44.4588737487793, "logps/rejected": -211.16944885253906, "loss": 2.0291, "rewards/accuracies": 0.0, "rewards/chosen": 5.122758865356445, "rewards/margins": -0.9429564476013184, "rewards/rejected": 6.065715312957764, "step": 6709 }, { "epoch": 1.49, "learning_rate": 1.6396329825256025e-06, "logits/chosen": -1.787500023841858, "logits/rejected": -1.7720450162887573, "logps/chosen": -45.84270477294922, "logps/rejected": -73.52333068847656, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 4.5273566246032715, "rewards/margins": 1.7394018173217773, "rewards/rejected": 2.787954807281494, "step": 6710 }, { "epoch": 1.49, "learning_rate": 1.6383060072383834e-06, "logits/chosen": -1.9499436616897583, "logits/rejected": -1.8419322967529297, "logps/chosen": -112.9239501953125, "logps/rejected": -56.01091384887695, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 6.036881923675537, "rewards/margins": 3.711876153945923, "rewards/rejected": 2.3250057697296143, "step": 6711 }, { "epoch": 1.49, "learning_rate": 1.6369794639207626e-06, "logits/chosen": -1.6116578578948975, "logits/rejected": -1.6116578578948975, "logps/chosen": -57.161949157714844, "logps/rejected": -57.161949157714844, "loss": 0.3552, "rewards/accuracies": 0.0, "rewards/chosen": 5.378028392791748, "rewards/margins": 0.0, "rewards/rejected": 5.378028392791748, "step": 6712 }, { "epoch": 1.49, "learning_rate": 1.635653352743195e-06, "logits/chosen": -2.0121636390686035, "logits/rejected": -1.8638650178909302, "logps/chosen": -92.29005432128906, "logps/rejected": -11.489300727844238, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 9.442970275878906, "rewards/margins": 8.499357223510742, "rewards/rejected": 0.9436133503913879, "step": 6713 }, { "epoch": 1.49, "learning_rate": 1.6343276738760837e-06, "logits/chosen": -1.9944576025009155, "logits/rejected": -2.009345769882202, "logps/chosen": -43.572303771972656, "logps/rejected": -95.60688781738281, "loss": 0.2465, "rewards/accuracies": 1.0, "rewards/chosen": 5.442163944244385, "rewards/margins": 0.5078024864196777, "rewards/rejected": 4.934361457824707, "step": 6714 }, { "epoch": 1.49, "learning_rate": 1.6330024274897755e-06, "logits/chosen": -2.1232354640960693, "logits/rejected": -2.133646011352539, "logps/chosen": -15.849455833435059, "logps/rejected": -52.97203063964844, "loss": 1.0559, "rewards/accuracies": 0.0, "rewards/chosen": 1.2715667486190796, "rewards/margins": -1.941513180732727, "rewards/rejected": 3.2130799293518066, "step": 6715 }, { "epoch": 1.49, "learning_rate": 1.6316776137545609e-06, "logits/chosen": -1.817604899406433, "logits/rejected": -1.794770359992981, "logps/chosen": -42.86934280395508, "logps/rejected": -67.95252227783203, "loss": 0.1565, "rewards/accuracies": 1.0, "rewards/chosen": 3.788904905319214, "rewards/margins": 1.420419692993164, "rewards/rejected": 2.36848521232605, "step": 6716 }, { "epoch": 1.49, "learning_rate": 1.6303532328406762e-06, "logits/chosen": -1.999630331993103, "logits/rejected": -1.941397786140442, "logps/chosen": -33.012168884277344, "logps/rejected": -64.21694946289062, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 5.0957865715026855, "rewards/margins": 2.7276437282562256, "rewards/rejected": 2.36814284324646, "step": 6717 }, { "epoch": 1.49, "learning_rate": 1.6290292849183003e-06, "logits/chosen": -1.839581847190857, "logits/rejected": -1.852408766746521, "logps/chosen": -136.0004119873047, "logps/rejected": -152.76873779296875, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 10.313311576843262, "rewards/margins": 2.2913999557495117, "rewards/rejected": 8.02191162109375, "step": 6718 }, { "epoch": 1.49, "learning_rate": 1.6277057701575584e-06, "logits/chosen": -1.8366646766662598, "logits/rejected": -1.7959866523742676, "logps/chosen": -43.75361251831055, "logps/rejected": -39.70149612426758, "loss": 0.2686, "rewards/accuracies": 1.0, "rewards/chosen": 3.641022205352783, "rewards/margins": 0.42870402336120605, "rewards/rejected": 3.212318181991577, "step": 6719 }, { "epoch": 1.49, "learning_rate": 1.6263826887285173e-06, "logits/chosen": -1.9778670072555542, "logits/rejected": -2.0037806034088135, "logps/chosen": -34.128082275390625, "logps/rejected": -80.22343444824219, "loss": 0.2204, "rewards/accuracies": 1.0, "rewards/chosen": 5.67495059967041, "rewards/margins": 0.7939095497131348, "rewards/rejected": 4.881041049957275, "step": 6720 }, { "epoch": 1.49, "learning_rate": 1.625060040801193e-06, "logits/chosen": -1.8374046087265015, "logits/rejected": -1.8365061283111572, "logps/chosen": -72.10769653320312, "logps/rejected": -63.80483627319336, "loss": 0.868, "rewards/accuracies": 1.0, "rewards/chosen": 4.058366298675537, "rewards/margins": 1.492643117904663, "rewards/rejected": 2.565723180770874, "step": 6721 }, { "epoch": 1.49, "learning_rate": 1.6237378265455367e-06, "logits/chosen": -1.5478241443634033, "logits/rejected": -1.5478241443634033, "logps/chosen": -12.527996063232422, "logps/rejected": -12.527996063232422, "loss": 0.3556, "rewards/accuracies": 0.0, "rewards/chosen": 3.45391583442688, "rewards/margins": 0.0, "rewards/rejected": 3.45391583442688, "step": 6722 }, { "epoch": 1.49, "learning_rate": 1.622416046131457e-06, "logits/chosen": -1.8198343515396118, "logits/rejected": -1.7316423654556274, "logps/chosen": -86.8775634765625, "logps/rejected": -50.13981628417969, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 4.581259250640869, "rewards/margins": 2.130545139312744, "rewards/rejected": 2.450714111328125, "step": 6723 }, { "epoch": 1.49, "learning_rate": 1.621094699728794e-06, "logits/chosen": -1.7590023279190063, "logits/rejected": -1.778026819229126, "logps/chosen": -48.99534225463867, "logps/rejected": -64.50251007080078, "loss": 0.3657, "rewards/accuracies": 1.0, "rewards/chosen": 5.099907398223877, "rewards/margins": 2.717249631881714, "rewards/rejected": 2.382657766342163, "step": 6724 }, { "epoch": 1.49, "learning_rate": 1.61977378750734e-06, "logits/chosen": -2.0147511959075928, "logits/rejected": -2.0740127563476562, "logps/chosen": -37.67771911621094, "logps/rejected": -99.11114501953125, "loss": 0.4743, "rewards/accuracies": 0.0, "rewards/chosen": 10.7019681930542, "rewards/margins": -0.37259578704833984, "rewards/rejected": 11.074563980102539, "step": 6725 }, { "epoch": 1.49, "learning_rate": 1.6184533096368277e-06, "logits/chosen": -1.7089959383010864, "logits/rejected": -1.7089959383010864, "logps/chosen": -35.83805847167969, "logps/rejected": -35.83805847167969, "loss": 0.5105, "rewards/accuracies": 0.0, "rewards/chosen": 2.6609275341033936, "rewards/margins": 0.0, "rewards/rejected": 2.6609275341033936, "step": 6726 }, { "epoch": 1.49, "learning_rate": 1.6171332662869366e-06, "logits/chosen": -1.9831656217575073, "logits/rejected": -1.9289742708206177, "logps/chosen": -115.30839538574219, "logps/rejected": -55.41496658325195, "loss": 0.1136, "rewards/accuracies": 1.0, "rewards/chosen": 5.71920919418335, "rewards/margins": 2.6600711345672607, "rewards/rejected": 3.059138059616089, "step": 6727 }, { "epoch": 1.49, "learning_rate": 1.6158136576272882e-06, "logits/chosen": -1.7799805402755737, "logits/rejected": -1.7799805402755737, "logps/chosen": -1.531424880027771, "logps/rejected": -1.531424880027771, "loss": 0.8789, "rewards/accuracies": 0.0, "rewards/chosen": 1.3380985260009766, "rewards/margins": 0.0, "rewards/rejected": 1.3380985260009766, "step": 6728 }, { "epoch": 1.49, "learning_rate": 1.6144944838274495e-06, "logits/chosen": -2.2074527740478516, "logits/rejected": -2.185401201248169, "logps/chosen": -23.725269317626953, "logps/rejected": -42.17521667480469, "loss": 0.2408, "rewards/accuracies": 1.0, "rewards/chosen": 3.1279408931732178, "rewards/margins": 0.924433708190918, "rewards/rejected": 2.2035071849823, "step": 6729 }, { "epoch": 1.49, "learning_rate": 1.613175745056933e-06, "logits/chosen": -2.047555446624756, "logits/rejected": -1.96083402633667, "logps/chosen": -106.83404541015625, "logps/rejected": -37.18456268310547, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 5.62349271774292, "rewards/margins": 2.9054605960845947, "rewards/rejected": 2.718032121658325, "step": 6730 }, { "epoch": 1.49, "learning_rate": 1.6118574414851884e-06, "logits/chosen": -1.9854495525360107, "logits/rejected": -1.900185227394104, "logps/chosen": -40.85812759399414, "logps/rejected": -22.535518646240234, "loss": 0.1033, "rewards/accuracies": 1.0, "rewards/chosen": 5.362755298614502, "rewards/margins": 4.315557479858398, "rewards/rejected": 1.047197699546814, "step": 6731 }, { "epoch": 1.49, "learning_rate": 1.6105395732816204e-06, "logits/chosen": -2.0284409523010254, "logits/rejected": -2.0292341709136963, "logps/chosen": -71.66665649414062, "logps/rejected": -75.07707977294922, "loss": 0.2426, "rewards/accuracies": 1.0, "rewards/chosen": 4.4323649406433105, "rewards/margins": 0.701756477355957, "rewards/rejected": 3.7306084632873535, "step": 6732 }, { "epoch": 1.49, "learning_rate": 1.6092221406155679e-06, "logits/chosen": -1.9020494222640991, "logits/rejected": -1.8620959520339966, "logps/chosen": -30.302810668945312, "logps/rejected": -71.7891845703125, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": 3.385911703109741, "rewards/margins": 1.6536790132522583, "rewards/rejected": 1.732232689857483, "step": 6733 }, { "epoch": 1.49, "learning_rate": 1.6079051436563193e-06, "logits/chosen": -2.025313138961792, "logits/rejected": -1.860093593597412, "logps/chosen": -153.82164001464844, "logps/rejected": -68.32789611816406, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 8.704548835754395, "rewards/margins": 6.8796892166137695, "rewards/rejected": 1.824859619140625, "step": 6734 }, { "epoch": 1.49, "learning_rate": 1.6065885825731047e-06, "logits/chosen": -1.7305091619491577, "logits/rejected": -1.711889386177063, "logps/chosen": -24.65569496154785, "logps/rejected": -23.95524787902832, "loss": 0.297, "rewards/accuracies": 1.0, "rewards/chosen": 2.577803611755371, "rewards/margins": 0.8603988885879517, "rewards/rejected": 1.7174047231674194, "step": 6735 }, { "epoch": 1.49, "learning_rate": 1.6052724575351004e-06, "logits/chosen": -2.0377228260040283, "logits/rejected": -1.988958477973938, "logps/chosen": -39.04669189453125, "logps/rejected": -59.9901123046875, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": 2.8762948513031006, "rewards/margins": 1.4176576137542725, "rewards/rejected": 1.4586372375488281, "step": 6736 }, { "epoch": 1.49, "learning_rate": 1.603956768711426e-06, "logits/chosen": -1.780020833015442, "logits/rejected": -1.7462903261184692, "logps/chosen": -50.387603759765625, "logps/rejected": -85.506591796875, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": 4.799971103668213, "rewards/margins": 2.015638828277588, "rewards/rejected": 2.784332275390625, "step": 6737 }, { "epoch": 1.49, "learning_rate": 1.60264151627114e-06, "logits/chosen": -1.6833431720733643, "logits/rejected": -1.6654691696166992, "logps/chosen": -59.384735107421875, "logps/rejected": -58.8538818359375, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 6.761414527893066, "rewards/margins": 1.4001588821411133, "rewards/rejected": 5.361255645751953, "step": 6738 }, { "epoch": 1.49, "learning_rate": 1.601326700383255e-06, "logits/chosen": -2.0596814155578613, "logits/rejected": -2.026353120803833, "logps/chosen": -154.586181640625, "logps/rejected": -110.92729187011719, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": 8.089251518249512, "rewards/margins": 2.0988874435424805, "rewards/rejected": 5.990364074707031, "step": 6739 }, { "epoch": 1.49, "learning_rate": 1.6000123212167158e-06, "logits/chosen": -2.005638599395752, "logits/rejected": -1.9477235078811646, "logps/chosen": -59.56175231933594, "logps/rejected": -87.38508605957031, "loss": 0.3787, "rewards/accuracies": 1.0, "rewards/chosen": 3.578355550765991, "rewards/margins": 0.7230033874511719, "rewards/rejected": 2.8553521633148193, "step": 6740 }, { "epoch": 1.49, "learning_rate": 1.598698378940423e-06, "logits/chosen": -2.3141393661499023, "logits/rejected": -2.3309824466705322, "logps/chosen": -114.26434326171875, "logps/rejected": -74.13640594482422, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 6.409216403961182, "rewards/margins": 4.515682220458984, "rewards/rejected": 1.8935340642929077, "step": 6741 }, { "epoch": 1.49, "learning_rate": 1.597384873723209e-06, "logits/chosen": -1.8224973678588867, "logits/rejected": -1.8224973678588867, "logps/chosen": -10.662997245788574, "logps/rejected": -10.662997245788574, "loss": 0.3719, "rewards/accuracies": 0.0, "rewards/chosen": 3.2432618141174316, "rewards/margins": 0.0, "rewards/rejected": 3.2432618141174316, "step": 6742 }, { "epoch": 1.49, "learning_rate": 1.596071805733862e-06, "logits/chosen": -1.642930030822754, "logits/rejected": -1.6239516735076904, "logps/chosen": -50.35441207885742, "logps/rejected": -74.46114349365234, "loss": 0.9134, "rewards/accuracies": 1.0, "rewards/chosen": 3.723682165145874, "rewards/margins": 1.2850873470306396, "rewards/rejected": 2.4385948181152344, "step": 6743 }, { "epoch": 1.49, "learning_rate": 1.5947591751411034e-06, "logits/chosen": -1.883463978767395, "logits/rejected": -1.8719135522842407, "logps/chosen": -54.188926696777344, "logps/rejected": -76.81014251708984, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 6.0099358558654785, "rewards/margins": 3.6740434169769287, "rewards/rejected": 2.33589243888855, "step": 6744 }, { "epoch": 1.49, "learning_rate": 1.5934469821136056e-06, "logits/chosen": -1.7466349601745605, "logits/rejected": -1.7139105796813965, "logps/chosen": -59.25291442871094, "logps/rejected": -60.983131408691406, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": 2.7247512340545654, "rewards/margins": 1.2012534141540527, "rewards/rejected": 1.5234978199005127, "step": 6745 }, { "epoch": 1.49, "learning_rate": 1.5921352268199824e-06, "logits/chosen": -1.924513339996338, "logits/rejected": -1.8901216983795166, "logps/chosen": -88.98834228515625, "logps/rejected": -53.757240295410156, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 5.643929958343506, "rewards/margins": 2.3604772090911865, "rewards/rejected": 3.2834527492523193, "step": 6746 }, { "epoch": 1.49, "learning_rate": 1.590823909428787e-06, "logits/chosen": -1.9716894626617432, "logits/rejected": -1.828240990638733, "logps/chosen": -211.33714294433594, "logps/rejected": -83.72218322753906, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 8.75517749786377, "rewards/margins": 5.047752380371094, "rewards/rejected": 3.7074248790740967, "step": 6747 }, { "epoch": 1.49, "learning_rate": 1.5895130301085276e-06, "logits/chosen": -2.0635080337524414, "logits/rejected": -2.0433812141418457, "logps/chosen": -136.84442138671875, "logps/rejected": -64.23819732666016, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 5.0489959716796875, "rewards/margins": 3.387014865875244, "rewards/rejected": 1.661981225013733, "step": 6748 }, { "epoch": 1.49, "learning_rate": 1.5882025890276415e-06, "logits/chosen": -1.8746644258499146, "logits/rejected": -1.8898032903671265, "logps/chosen": -58.63026428222656, "logps/rejected": -73.97415924072266, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": 4.937455654144287, "rewards/margins": 0.1713566780090332, "rewards/rejected": 4.766098976135254, "step": 6749 }, { "epoch": 1.49, "learning_rate": 1.5868925863545242e-06, "logits/chosen": -1.9804279804229736, "logits/rejected": -1.9517558813095093, "logps/chosen": -40.7154655456543, "logps/rejected": -22.26706886291504, "loss": 0.382, "rewards/accuracies": 1.0, "rewards/chosen": 2.9462878704071045, "rewards/margins": 0.2539684772491455, "rewards/rejected": 2.692319393157959, "step": 6750 }, { "epoch": 1.49, "learning_rate": 1.585583022257502e-06, "logits/chosen": -1.7991641759872437, "logits/rejected": -1.7179285287857056, "logps/chosen": -99.99776458740234, "logps/rejected": -56.958343505859375, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 8.76895809173584, "rewards/margins": 4.801435470581055, "rewards/rejected": 3.967522382736206, "step": 6751 }, { "epoch": 1.49, "learning_rate": 1.584273896904856e-06, "logits/chosen": -1.9235327243804932, "logits/rejected": -1.789320707321167, "logps/chosen": -51.17534637451172, "logps/rejected": -24.41606903076172, "loss": 0.6162, "rewards/accuracies": 1.0, "rewards/chosen": 2.8595588207244873, "rewards/margins": 1.3588969707489014, "rewards/rejected": 1.500661849975586, "step": 6752 }, { "epoch": 1.49, "learning_rate": 1.5829652104648018e-06, "logits/chosen": -1.660073161125183, "logits/rejected": -1.6050044298171997, "logps/chosen": -66.65908813476562, "logps/rejected": -77.83848571777344, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 5.946412086486816, "rewards/margins": 2.9298219680786133, "rewards/rejected": 3.016590118408203, "step": 6753 }, { "epoch": 1.49, "learning_rate": 1.581656963105504e-06, "logits/chosen": -2.0593907833099365, "logits/rejected": -2.04716157913208, "logps/chosen": -16.364702224731445, "logps/rejected": -56.9915771484375, "loss": 1.5861, "rewards/accuracies": 0.0, "rewards/chosen": 2.695751428604126, "rewards/margins": -2.420464277267456, "rewards/rejected": 5.116215705871582, "step": 6754 }, { "epoch": 1.5, "learning_rate": 1.58034915499507e-06, "logits/chosen": -2.0618019104003906, "logits/rejected": -1.974068284034729, "logps/chosen": -44.45967102050781, "logps/rejected": -22.258487701416016, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 3.3375070095062256, "rewards/margins": 2.394904136657715, "rewards/rejected": 0.9426027536392212, "step": 6755 }, { "epoch": 1.5, "learning_rate": 1.579041786301546e-06, "logits/chosen": -1.8678314685821533, "logits/rejected": -1.8113335371017456, "logps/chosen": -21.83329963684082, "logps/rejected": -19.912139892578125, "loss": 0.2113, "rewards/accuracies": 1.0, "rewards/chosen": 1.5439329147338867, "rewards/margins": 0.6693605184555054, "rewards/rejected": 0.8745723962783813, "step": 6756 }, { "epoch": 1.5, "learning_rate": 1.577734857192932e-06, "logits/chosen": -2.022146701812744, "logits/rejected": -2.0132522583007812, "logps/chosen": -54.53111267089844, "logps/rejected": -67.4024429321289, "loss": 0.209, "rewards/accuracies": 1.0, "rewards/chosen": 6.051072597503662, "rewards/margins": 0.8855862617492676, "rewards/rejected": 5.1654863357543945, "step": 6757 }, { "epoch": 1.5, "learning_rate": 1.5764283678371588e-06, "logits/chosen": -1.8787299394607544, "logits/rejected": -1.3392986059188843, "logps/chosen": -34.043052673339844, "logps/rejected": -116.3114013671875, "loss": 0.6564, "rewards/accuracies": 0.0, "rewards/chosen": 2.9085845947265625, "rewards/margins": -0.7891068458557129, "rewards/rejected": 3.6976914405822754, "step": 6758 }, { "epoch": 1.5, "learning_rate": 1.575122318402113e-06, "logits/chosen": -2.0029704570770264, "logits/rejected": -1.9818836450576782, "logps/chosen": -52.649620056152344, "logps/rejected": -69.83964538574219, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 5.640872955322266, "rewards/margins": 2.904825448989868, "rewards/rejected": 2.7360475063323975, "step": 6759 }, { "epoch": 1.5, "learning_rate": 1.573816709055615e-06, "logits/chosen": -1.9696708917617798, "logits/rejected": -1.9771233797073364, "logps/chosen": -36.725074768066406, "logps/rejected": -35.39680480957031, "loss": 0.4252, "rewards/accuracies": 1.0, "rewards/chosen": 2.9487175941467285, "rewards/margins": 0.28449368476867676, "rewards/rejected": 2.6642239093780518, "step": 6760 }, { "epoch": 1.5, "learning_rate": 1.5725115399654333e-06, "logits/chosen": -1.7712717056274414, "logits/rejected": -1.748843789100647, "logps/chosen": -31.672977447509766, "logps/rejected": -33.874393463134766, "loss": 0.4621, "rewards/accuracies": 1.0, "rewards/chosen": 3.3466382026672363, "rewards/margins": 1.3469620943069458, "rewards/rejected": 1.9996761083602905, "step": 6761 }, { "epoch": 1.5, "learning_rate": 1.5712068112992795e-06, "logits/chosen": -1.795270562171936, "logits/rejected": -1.6991357803344727, "logps/chosen": -70.43842315673828, "logps/rejected": -53.50621795654297, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 3.170428514480591, "rewards/margins": 2.2135231494903564, "rewards/rejected": 0.9569053649902344, "step": 6762 }, { "epoch": 1.5, "learning_rate": 1.5699025232248078e-06, "logits/chosen": -2.209740161895752, "logits/rejected": -2.2097482681274414, "logps/chosen": -71.69412231445312, "logps/rejected": -66.56851959228516, "loss": 0.3041, "rewards/accuracies": 1.0, "rewards/chosen": 5.451404571533203, "rewards/margins": 1.033886432647705, "rewards/rejected": 4.417518138885498, "step": 6763 }, { "epoch": 1.5, "learning_rate": 1.5685986759096177e-06, "logits/chosen": -2.2003445625305176, "logits/rejected": -2.165168523788452, "logps/chosen": -122.87031555175781, "logps/rejected": -94.11280822753906, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 7.630932807922363, "rewards/margins": 3.081944465637207, "rewards/rejected": 4.548988342285156, "step": 6764 }, { "epoch": 1.5, "learning_rate": 1.5672952695212458e-06, "logits/chosen": -1.9217181205749512, "logits/rejected": -1.931856632232666, "logps/chosen": -58.71839904785156, "logps/rejected": -73.3240966796875, "loss": 0.5009, "rewards/accuracies": 0.0, "rewards/chosen": 3.2774276733398438, "rewards/margins": -0.2824692726135254, "rewards/rejected": 3.559896945953369, "step": 6765 }, { "epoch": 1.5, "learning_rate": 1.5659923042271829e-06, "logits/chosen": -1.8420518636703491, "logits/rejected": -1.8605701923370361, "logps/chosen": -153.13937377929688, "logps/rejected": -96.96614074707031, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 7.947244167327881, "rewards/margins": 4.612876892089844, "rewards/rejected": 3.334367513656616, "step": 6766 }, { "epoch": 1.5, "learning_rate": 1.5646897801948502e-06, "logits/chosen": -2.0033538341522217, "logits/rejected": -1.9322582483291626, "logps/chosen": -67.09779357910156, "logps/rejected": -51.05500793457031, "loss": 0.4266, "rewards/accuracies": 1.0, "rewards/chosen": 5.50139856338501, "rewards/margins": 0.6666831970214844, "rewards/rejected": 4.834715366363525, "step": 6767 }, { "epoch": 1.5, "learning_rate": 1.5633876975916261e-06, "logits/chosen": -1.8416372537612915, "logits/rejected": -1.7721043825149536, "logps/chosen": -45.22199249267578, "logps/rejected": -11.199738502502441, "loss": 0.2025, "rewards/accuracies": 1.0, "rewards/chosen": 1.7311912775039673, "rewards/margins": 0.7112780809402466, "rewards/rejected": 1.0199131965637207, "step": 6768 }, { "epoch": 1.5, "learning_rate": 1.56208605658482e-06, "logits/chosen": -1.8332127332687378, "logits/rejected": -1.8660858869552612, "logps/chosen": -35.5779914855957, "logps/rejected": -50.267311096191406, "loss": 1.4486, "rewards/accuracies": 0.0, "rewards/chosen": 3.6722638607025146, "rewards/margins": -2.473619222640991, "rewards/rejected": 6.145883083343506, "step": 6769 }, { "epoch": 1.5, "learning_rate": 1.5607848573416905e-06, "logits/chosen": -1.853079080581665, "logits/rejected": -1.853079080581665, "logps/chosen": -29.505451202392578, "logps/rejected": -29.505451202392578, "loss": 0.3477, "rewards/accuracies": 0.0, "rewards/chosen": 3.531583070755005, "rewards/margins": 0.0, "rewards/rejected": 3.531583070755005, "step": 6770 }, { "epoch": 1.5, "learning_rate": 1.5594841000294397e-06, "logits/chosen": -1.7804646492004395, "logits/rejected": -1.7797534465789795, "logps/chosen": -30.571704864501953, "logps/rejected": -42.1142578125, "loss": 0.5213, "rewards/accuracies": 1.0, "rewards/chosen": 3.0823826789855957, "rewards/margins": 0.3878200054168701, "rewards/rejected": 2.6945626735687256, "step": 6771 }, { "epoch": 1.5, "learning_rate": 1.5581837848152114e-06, "logits/chosen": -2.003375291824341, "logits/rejected": -1.9086871147155762, "logps/chosen": -49.057464599609375, "logps/rejected": -23.153772354125977, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 3.3921096324920654, "rewards/margins": 2.8416497707366943, "rewards/rejected": 0.5504598617553711, "step": 6772 }, { "epoch": 1.5, "learning_rate": 1.5568839118660927e-06, "logits/chosen": -2.019904136657715, "logits/rejected": -1.9830220937728882, "logps/chosen": -90.25267028808594, "logps/rejected": -106.50959777832031, "loss": 0.223, "rewards/accuracies": 1.0, "rewards/chosen": 8.665239334106445, "rewards/margins": 0.7341008186340332, "rewards/rejected": 7.931138515472412, "step": 6773 }, { "epoch": 1.5, "learning_rate": 1.5555844813491139e-06, "logits/chosen": -2.092254400253296, "logits/rejected": -1.7997385263442993, "logps/chosen": -26.293601989746094, "logps/rejected": -79.81608581542969, "loss": 0.158, "rewards/accuracies": 1.0, "rewards/chosen": 5.465662479400635, "rewards/margins": 1.239682674407959, "rewards/rejected": 4.225979804992676, "step": 6774 }, { "epoch": 1.5, "learning_rate": 1.5542854934312512e-06, "logits/chosen": -1.915980339050293, "logits/rejected": -1.953798770904541, "logps/chosen": -37.052974700927734, "logps/rejected": -80.49067687988281, "loss": 0.0756, "rewards/accuracies": 1.0, "rewards/chosen": 3.428786039352417, "rewards/margins": 1.8232333660125732, "rewards/rejected": 1.6055526733398438, "step": 6775 }, { "epoch": 1.5, "learning_rate": 1.5529869482794157e-06, "logits/chosen": -1.987115740776062, "logits/rejected": -1.8722056150436401, "logps/chosen": -122.70601654052734, "logps/rejected": -36.507049560546875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 6.7544121742248535, "rewards/margins": 3.655348300933838, "rewards/rejected": 3.0990638732910156, "step": 6776 }, { "epoch": 1.5, "learning_rate": 1.5516888460604744e-06, "logits/chosen": -1.9425592422485352, "logits/rejected": -1.9253278970718384, "logps/chosen": -46.8438720703125, "logps/rejected": -75.27350616455078, "loss": 0.7483, "rewards/accuracies": 0.0, "rewards/chosen": 3.1107773780822754, "rewards/margins": -0.49677205085754395, "rewards/rejected": 3.6075494289398193, "step": 6777 }, { "epoch": 1.5, "learning_rate": 1.5503911869412258e-06, "logits/chosen": -2.044724225997925, "logits/rejected": -1.947222113609314, "logps/chosen": -97.42457580566406, "logps/rejected": -40.428123474121094, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 6.708763122558594, "rewards/margins": 3.831496477127075, "rewards/rejected": 2.8772666454315186, "step": 6778 }, { "epoch": 1.5, "learning_rate": 1.5490939710884162e-06, "logits/chosen": -2.1035633087158203, "logits/rejected": -2.0567073822021484, "logps/chosen": -39.79343795776367, "logps/rejected": -71.75796508789062, "loss": 0.2388, "rewards/accuracies": 1.0, "rewards/chosen": 3.226255416870117, "rewards/margins": 1.1791751384735107, "rewards/rejected": 2.0470802783966064, "step": 6779 }, { "epoch": 1.5, "learning_rate": 1.5477971986687361e-06, "logits/chosen": -1.7775352001190186, "logits/rejected": -1.884946346282959, "logps/chosen": -57.43120574951172, "logps/rejected": -119.40489196777344, "loss": 3.6842, "rewards/accuracies": 0.0, "rewards/chosen": 3.6874656677246094, "rewards/margins": -7.352397918701172, "rewards/rejected": 11.039863586425781, "step": 6780 }, { "epoch": 1.5, "learning_rate": 1.5465008698488166e-06, "logits/chosen": -1.9170069694519043, "logits/rejected": -1.9158427715301514, "logps/chosen": -55.64155197143555, "logps/rejected": -61.743812561035156, "loss": 0.238, "rewards/accuracies": 1.0, "rewards/chosen": 3.401045560836792, "rewards/margins": 0.7167530059814453, "rewards/rejected": 2.6842925548553467, "step": 6781 }, { "epoch": 1.5, "learning_rate": 1.5452049847952338e-06, "logits/chosen": -1.867066740989685, "logits/rejected": -1.7710837125778198, "logps/chosen": -95.2130126953125, "logps/rejected": -62.54559326171875, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 6.298699855804443, "rewards/margins": 3.7596449851989746, "rewards/rejected": 2.5390548706054688, "step": 6782 }, { "epoch": 1.5, "learning_rate": 1.543909543674505e-06, "logits/chosen": -1.9977372884750366, "logits/rejected": -1.9939411878585815, "logps/chosen": -27.728282928466797, "logps/rejected": -57.722808837890625, "loss": 0.1629, "rewards/accuracies": 1.0, "rewards/chosen": 2.9443893432617188, "rewards/margins": 0.9700499773025513, "rewards/rejected": 1.9743393659591675, "step": 6783 }, { "epoch": 1.5, "learning_rate": 1.5426145466530934e-06, "logits/chosen": -1.7878113985061646, "logits/rejected": -1.7434784173965454, "logps/chosen": -53.07438278198242, "logps/rejected": -38.58103942871094, "loss": 0.4073, "rewards/accuracies": 0.0, "rewards/chosen": 3.544386625289917, "rewards/margins": -0.009271621704101562, "rewards/rejected": 3.5536582469940186, "step": 6784 }, { "epoch": 1.5, "learning_rate": 1.5413199938973983e-06, "logits/chosen": -1.8129229545593262, "logits/rejected": -1.7131214141845703, "logps/chosen": -80.20018005371094, "logps/rejected": -56.89829635620117, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 6.240348815917969, "rewards/margins": 2.368394136428833, "rewards/rejected": 3.8719546794891357, "step": 6785 }, { "epoch": 1.5, "learning_rate": 1.5400258855737726e-06, "logits/chosen": -1.7528488636016846, "logits/rejected": -1.7528488636016846, "logps/chosen": -34.12496566772461, "logps/rejected": -34.12496566772461, "loss": 0.3483, "rewards/accuracies": 0.0, "rewards/chosen": 3.8338704109191895, "rewards/margins": 0.0, "rewards/rejected": 3.8338704109191895, "step": 6786 }, { "epoch": 1.5, "learning_rate": 1.5387322218485011e-06, "logits/chosen": -1.8090019226074219, "logits/rejected": -1.7427946329116821, "logps/chosen": -52.54677200317383, "logps/rejected": -23.44222640991211, "loss": 0.1571, "rewards/accuracies": 1.0, "rewards/chosen": 2.3468525409698486, "rewards/margins": 1.0037147998809814, "rewards/rejected": 1.3431377410888672, "step": 6787 }, { "epoch": 1.5, "learning_rate": 1.537439002887819e-06, "logits/chosen": -1.9406925439834595, "logits/rejected": -1.8527631759643555, "logps/chosen": -119.27944946289062, "logps/rejected": -49.739131927490234, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 6.997703552246094, "rewards/margins": 3.9266109466552734, "rewards/rejected": 3.0710926055908203, "step": 6788 }, { "epoch": 1.5, "learning_rate": 1.536146228857901e-06, "logits/chosen": -2.125988483428955, "logits/rejected": -2.034832000732422, "logps/chosen": -50.21388626098633, "logps/rejected": -21.770069122314453, "loss": 0.1907, "rewards/accuracies": 1.0, "rewards/chosen": 2.0630033016204834, "rewards/margins": 1.2132179737091064, "rewards/rejected": 0.8497852683067322, "step": 6789 }, { "epoch": 1.5, "learning_rate": 1.5348538999248652e-06, "logits/chosen": -2.081716775894165, "logits/rejected": -2.027599573135376, "logps/chosen": -55.36576843261719, "logps/rejected": -12.098175048828125, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": 4.053152561187744, "rewards/margins": 3.315063953399658, "rewards/rejected": 0.7380886077880859, "step": 6790 }, { "epoch": 1.5, "learning_rate": 1.5335620162547738e-06, "logits/chosen": -2.1078271865844727, "logits/rejected": -2.080373764038086, "logps/chosen": -84.10580444335938, "logps/rejected": -80.23558044433594, "loss": 0.2331, "rewards/accuracies": 1.0, "rewards/chosen": 6.083953857421875, "rewards/margins": 0.5617203712463379, "rewards/rejected": 5.522233486175537, "step": 6791 }, { "epoch": 1.5, "learning_rate": 1.5322705780136304e-06, "logits/chosen": -1.990140676498413, "logits/rejected": -1.9935855865478516, "logps/chosen": -23.57783317565918, "logps/rejected": -51.484413146972656, "loss": 0.7741, "rewards/accuracies": 0.0, "rewards/chosen": 3.4644057750701904, "rewards/margins": -0.3013343811035156, "rewards/rejected": 3.765740156173706, "step": 6792 }, { "epoch": 1.5, "learning_rate": 1.5309795853673815e-06, "logits/chosen": -1.9402482509613037, "logits/rejected": -1.9897066354751587, "logps/chosen": -13.891160011291504, "logps/rejected": -52.997047424316406, "loss": 0.621, "rewards/accuracies": 0.0, "rewards/chosen": 3.2003300189971924, "rewards/margins": -0.8323428630828857, "rewards/rejected": 4.032672882080078, "step": 6793 }, { "epoch": 1.5, "learning_rate": 1.5296890384819162e-06, "logits/chosen": -1.800487995147705, "logits/rejected": -1.8869340419769287, "logps/chosen": -52.599884033203125, "logps/rejected": -104.57393646240234, "loss": 2.3979, "rewards/accuracies": 0.0, "rewards/chosen": 3.724411725997925, "rewards/margins": -4.7426347732543945, "rewards/rejected": 8.467046737670898, "step": 6794 }, { "epoch": 1.5, "learning_rate": 1.528398937523069e-06, "logits/chosen": -1.9697856903076172, "logits/rejected": -1.9596269130706787, "logps/chosen": -61.36261749267578, "logps/rejected": -135.9046173095703, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": 7.06588077545166, "rewards/margins": 2.129554271697998, "rewards/rejected": 4.936326503753662, "step": 6795 }, { "epoch": 1.5, "learning_rate": 1.5271092826566108e-06, "logits/chosen": -1.9439741373062134, "logits/rejected": -1.9366620779037476, "logps/chosen": -41.12795639038086, "logps/rejected": -104.15565490722656, "loss": 0.6293, "rewards/accuracies": 1.0, "rewards/chosen": 4.3941779136657715, "rewards/margins": 0.23376083374023438, "rewards/rejected": 4.160417079925537, "step": 6796 }, { "epoch": 1.5, "learning_rate": 1.5258200740482614e-06, "logits/chosen": -1.9244433641433716, "logits/rejected": -1.9026809930801392, "logps/chosen": -37.57563018798828, "logps/rejected": -37.75661849975586, "loss": 0.2337, "rewards/accuracies": 1.0, "rewards/chosen": 3.618035078048706, "rewards/margins": 1.0061030387878418, "rewards/rejected": 2.6119320392608643, "step": 6797 }, { "epoch": 1.5, "learning_rate": 1.5245313118636811e-06, "logits/chosen": -2.123103141784668, "logits/rejected": -2.0554277896881104, "logps/chosen": -67.63209533691406, "logps/rejected": -53.396240234375, "loss": 0.1781, "rewards/accuracies": 1.0, "rewards/chosen": 7.413357734680176, "rewards/margins": 0.8608851432800293, "rewards/rejected": 6.5524725914001465, "step": 6798 }, { "epoch": 1.5, "learning_rate": 1.523242996268472e-06, "logits/chosen": -1.4680125713348389, "logits/rejected": -1.4680125713348389, "logps/chosen": -13.319191932678223, "logps/rejected": -13.319191932678223, "loss": 0.6387, "rewards/accuracies": 0.0, "rewards/chosen": 0.7004947066307068, "rewards/margins": 0.0, "rewards/rejected": 0.7004947066307068, "step": 6799 }, { "epoch": 1.51, "learning_rate": 1.5219551274281802e-06, "logits/chosen": -2.0441079139709473, "logits/rejected": -1.7485082149505615, "logps/chosen": -33.74472427368164, "logps/rejected": -32.51749801635742, "loss": 0.182, "rewards/accuracies": 1.0, "rewards/chosen": 3.3287580013275146, "rewards/margins": 1.0400803089141846, "rewards/rejected": 2.28867769241333, "step": 6800 }, { "epoch": 1.51, "learning_rate": 1.5206677055082936e-06, "logits/chosen": -2.1463046073913574, "logits/rejected": -2.0689706802368164, "logps/chosen": -72.6418228149414, "logps/rejected": -129.5626983642578, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 8.83377456665039, "rewards/margins": 4.337981224060059, "rewards/rejected": 4.495793342590332, "step": 6801 }, { "epoch": 1.51, "learning_rate": 1.5193807306742425e-06, "logits/chosen": -2.024444341659546, "logits/rejected": -1.9704197645187378, "logps/chosen": -108.61314392089844, "logps/rejected": -311.1866455078125, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 13.735020637512207, "rewards/margins": 2.543977737426758, "rewards/rejected": 11.19104290008545, "step": 6802 }, { "epoch": 1.51, "learning_rate": 1.5180942030914003e-06, "logits/chosen": -1.8587685823440552, "logits/rejected": -1.7998651266098022, "logps/chosen": -60.445281982421875, "logps/rejected": -47.81825256347656, "loss": 0.2309, "rewards/accuracies": 1.0, "rewards/chosen": 3.6393203735351562, "rewards/margins": 1.1223068237304688, "rewards/rejected": 2.5170135498046875, "step": 6803 }, { "epoch": 1.51, "learning_rate": 1.516808122925083e-06, "logits/chosen": -1.5521211624145508, "logits/rejected": -1.0437088012695312, "logps/chosen": -127.58694458007812, "logps/rejected": -48.963558197021484, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": 5.8792405128479, "rewards/margins": 2.8740406036376953, "rewards/rejected": 3.005199909210205, "step": 6804 }, { "epoch": 1.51, "learning_rate": 1.5155224903405502e-06, "logits/chosen": -1.7371973991394043, "logits/rejected": -1.679337501525879, "logps/chosen": -97.31964111328125, "logps/rejected": -73.94049835205078, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 5.542799472808838, "rewards/margins": 2.8746421337127686, "rewards/rejected": 2.6681573390960693, "step": 6805 }, { "epoch": 1.51, "learning_rate": 1.514237305502999e-06, "logits/chosen": -1.9137715101242065, "logits/rejected": -1.8637828826904297, "logps/chosen": -29.11713218688965, "logps/rejected": -32.079254150390625, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": 2.945189952850342, "rewards/margins": 1.2915865182876587, "rewards/rejected": 1.653603434562683, "step": 6806 }, { "epoch": 1.51, "learning_rate": 1.5129525685775747e-06, "logits/chosen": -1.9901971817016602, "logits/rejected": -1.9995023012161255, "logps/chosen": -20.762359619140625, "logps/rejected": -65.47624206542969, "loss": 0.6746, "rewards/accuracies": 0.0, "rewards/chosen": 3.1603195667266846, "rewards/margins": -0.6459548473358154, "rewards/rejected": 3.8062744140625, "step": 6807 }, { "epoch": 1.51, "learning_rate": 1.511668279729363e-06, "logits/chosen": -2.0007822513580322, "logits/rejected": -2.036022186279297, "logps/chosen": -64.93607330322266, "logps/rejected": -87.52408599853516, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 7.14410924911499, "rewards/margins": 5.052560329437256, "rewards/rejected": 2.0915489196777344, "step": 6808 }, { "epoch": 1.51, "learning_rate": 1.5103844391233913e-06, "logits/chosen": -1.962380051612854, "logits/rejected": -1.939168930053711, "logps/chosen": -46.39433288574219, "logps/rejected": -57.968475341796875, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": 4.354795932769775, "rewards/margins": 0.9558854103088379, "rewards/rejected": 3.3989105224609375, "step": 6809 }, { "epoch": 1.51, "learning_rate": 1.5091010469246303e-06, "logits/chosen": -2.0902583599090576, "logits/rejected": -1.9787330627441406, "logps/chosen": -39.354156494140625, "logps/rejected": -33.15778350830078, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 4.646673679351807, "rewards/margins": 4.080943584442139, "rewards/rejected": 0.5657302737236023, "step": 6810 }, { "epoch": 1.51, "learning_rate": 1.5078181032979933e-06, "logits/chosen": -1.7680268287658691, "logits/rejected": -1.854936122894287, "logps/chosen": -51.44183349609375, "logps/rejected": -109.79389953613281, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 4.86970853805542, "rewards/margins": 2.337111234664917, "rewards/rejected": 2.532597303390503, "step": 6811 }, { "epoch": 1.51, "learning_rate": 1.506535608408336e-06, "logits/chosen": -1.5563064813613892, "logits/rejected": -1.4702718257904053, "logps/chosen": -47.05622100830078, "logps/rejected": -35.45729064941406, "loss": 0.1386, "rewards/accuracies": 1.0, "rewards/chosen": 1.0530403852462769, "rewards/margins": 1.2171635627746582, "rewards/rejected": -0.16412316262722015, "step": 6812 }, { "epoch": 1.51, "learning_rate": 1.5052535624204522e-06, "logits/chosen": -1.9260265827178955, "logits/rejected": -1.9233405590057373, "logps/chosen": -19.268836975097656, "logps/rejected": -51.842681884765625, "loss": 0.454, "rewards/accuracies": 0.0, "rewards/chosen": 2.591771364212036, "rewards/margins": -0.345170259475708, "rewards/rejected": 2.936941623687744, "step": 6813 }, { "epoch": 1.51, "learning_rate": 1.5039719654990875e-06, "logits/chosen": -1.8980931043624878, "logits/rejected": -1.97365140914917, "logps/chosen": -38.00525665283203, "logps/rejected": -87.34418487548828, "loss": 1.3296, "rewards/accuracies": 0.0, "rewards/chosen": 6.126462459564209, "rewards/margins": -2.5770297050476074, "rewards/rejected": 8.703492164611816, "step": 6814 }, { "epoch": 1.51, "learning_rate": 1.5026908178089195e-06, "logits/chosen": -1.6627570390701294, "logits/rejected": -1.5779839754104614, "logps/chosen": -99.49380493164062, "logps/rejected": -79.36361694335938, "loss": 0.1771, "rewards/accuracies": 1.0, "rewards/chosen": 6.265371799468994, "rewards/margins": 2.456838369369507, "rewards/rejected": 3.8085334300994873, "step": 6815 }, { "epoch": 1.51, "learning_rate": 1.5014101195145741e-06, "logits/chosen": -2.0049896240234375, "logits/rejected": -2.0049896240234375, "logps/chosen": -37.947120666503906, "logps/rejected": -37.947120666503906, "loss": 0.3541, "rewards/accuracies": 0.0, "rewards/chosen": 2.8207390308380127, "rewards/margins": 0.0, "rewards/rejected": 2.8207390308380127, "step": 6816 }, { "epoch": 1.51, "learning_rate": 1.5001298707806183e-06, "logits/chosen": -2.1342215538024902, "logits/rejected": -2.022339344024658, "logps/chosen": -114.82403564453125, "logps/rejected": -49.76389694213867, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 7.850012302398682, "rewards/margins": 4.550419330596924, "rewards/rejected": 3.299592971801758, "step": 6817 }, { "epoch": 1.51, "learning_rate": 1.4988500717715604e-06, "logits/chosen": -1.8611470460891724, "logits/rejected": -1.7726686000823975, "logps/chosen": -105.93209075927734, "logps/rejected": -94.70069122314453, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 5.2216925621032715, "rewards/margins": 3.516407012939453, "rewards/rejected": 1.705285668373108, "step": 6818 }, { "epoch": 1.51, "learning_rate": 1.4975707226518515e-06, "logits/chosen": -2.193892240524292, "logits/rejected": -2.1642534732818604, "logps/chosen": -46.570343017578125, "logps/rejected": -48.870452880859375, "loss": 0.0814, "rewards/accuracies": 1.0, "rewards/chosen": 4.3892741203308105, "rewards/margins": 1.7556848526000977, "rewards/rejected": 2.633589267730713, "step": 6819 }, { "epoch": 1.51, "learning_rate": 1.4962918235858853e-06, "logits/chosen": -1.9053932428359985, "logits/rejected": -1.9720628261566162, "logps/chosen": -27.288715362548828, "logps/rejected": -73.17691040039062, "loss": 0.4385, "rewards/accuracies": 0.0, "rewards/chosen": 4.81977653503418, "rewards/margins": -0.2979006767272949, "rewards/rejected": 5.117677211761475, "step": 6820 }, { "epoch": 1.51, "learning_rate": 1.4950133747379985e-06, "logits/chosen": -2.194998025894165, "logits/rejected": -1.7514442205429077, "logps/chosen": -59.493804931640625, "logps/rejected": -56.91889190673828, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 3.745008945465088, "rewards/margins": 0.9285469055175781, "rewards/rejected": 2.8164620399475098, "step": 6821 }, { "epoch": 1.51, "learning_rate": 1.4937353762724649e-06, "logits/chosen": -1.933132290840149, "logits/rejected": -1.933132290840149, "logps/chosen": -26.69764518737793, "logps/rejected": -26.69764518737793, "loss": 0.3554, "rewards/accuracies": 0.0, "rewards/chosen": 4.777684211730957, "rewards/margins": 0.0, "rewards/rejected": 4.777684211730957, "step": 6822 }, { "epoch": 1.51, "learning_rate": 1.492457828353509e-06, "logits/chosen": -1.8556612730026245, "logits/rejected": -1.7895444631576538, "logps/chosen": -41.620216369628906, "logps/rejected": -64.88780212402344, "loss": 0.1518, "rewards/accuracies": 1.0, "rewards/chosen": 4.145328521728516, "rewards/margins": 1.056243896484375, "rewards/rejected": 3.0890846252441406, "step": 6823 }, { "epoch": 1.51, "learning_rate": 1.4911807311452874e-06, "logits/chosen": -1.9185526371002197, "logits/rejected": -1.9607667922973633, "logps/chosen": -60.68463134765625, "logps/rejected": -101.40108489990234, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 5.315904140472412, "rewards/margins": 1.6887731552124023, "rewards/rejected": 3.6271309852600098, "step": 6824 }, { "epoch": 1.51, "learning_rate": 1.4899040848119096e-06, "logits/chosen": -1.6202263832092285, "logits/rejected": -1.4729535579681396, "logps/chosen": -76.242431640625, "logps/rejected": -44.43705749511719, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 3.2365777492523193, "rewards/margins": 2.860349416732788, "rewards/rejected": 0.37622833251953125, "step": 6825 }, { "epoch": 1.51, "learning_rate": 1.4886278895174178e-06, "logits/chosen": -1.958308219909668, "logits/rejected": -1.9004071950912476, "logps/chosen": -134.08343505859375, "logps/rejected": -85.82018280029297, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 6.445013523101807, "rewards/margins": 2.6989128589630127, "rewards/rejected": 3.746100664138794, "step": 6826 }, { "epoch": 1.51, "learning_rate": 1.487352145425801e-06, "logits/chosen": -1.7947827577590942, "logits/rejected": -1.7798718214035034, "logps/chosen": -37.32218551635742, "logps/rejected": -52.3724479675293, "loss": 0.4706, "rewards/accuracies": 1.0, "rewards/chosen": 3.606220006942749, "rewards/margins": 1.2671456336975098, "rewards/rejected": 2.3390743732452393, "step": 6827 }, { "epoch": 1.51, "learning_rate": 1.4860768527009894e-06, "logits/chosen": -1.5887190103530884, "logits/rejected": -1.5887190103530884, "logps/chosen": -37.346717834472656, "logps/rejected": -37.346717834472656, "loss": 0.5453, "rewards/accuracies": 0.0, "rewards/chosen": 3.4121696949005127, "rewards/margins": 0.0, "rewards/rejected": 3.4121696949005127, "step": 6828 }, { "epoch": 1.51, "learning_rate": 1.4848020115068545e-06, "logits/chosen": -1.922094702720642, "logits/rejected": -1.5567138195037842, "logps/chosen": -71.17173767089844, "logps/rejected": -103.61042785644531, "loss": 0.9031, "rewards/accuracies": 0.0, "rewards/chosen": 4.2544264793396, "rewards/margins": -1.5748810768127441, "rewards/rejected": 5.829307556152344, "step": 6829 }, { "epoch": 1.51, "learning_rate": 1.483527622007213e-06, "logits/chosen": -2.302168607711792, "logits/rejected": -2.32303786277771, "logps/chosen": -83.66279602050781, "logps/rejected": -93.54594421386719, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 11.342494010925293, "rewards/margins": 2.273426055908203, "rewards/rejected": 9.06906795501709, "step": 6830 }, { "epoch": 1.51, "learning_rate": 1.4822536843658152e-06, "logits/chosen": -2.3133065700531006, "logits/rejected": -2.247333526611328, "logps/chosen": -120.31343841552734, "logps/rejected": -72.01834106445312, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 6.5425333976745605, "rewards/margins": 2.7521440982818604, "rewards/rejected": 3.7903892993927, "step": 6831 }, { "epoch": 1.51, "learning_rate": 1.4809801987463657e-06, "logits/chosen": -1.9993290901184082, "logits/rejected": -1.929758906364441, "logps/chosen": -32.39445495605469, "logps/rejected": -31.255859375, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 5.323174476623535, "rewards/margins": 0.7631287574768066, "rewards/rejected": 4.5600457191467285, "step": 6832 }, { "epoch": 1.51, "learning_rate": 1.4797071653124985e-06, "logits/chosen": -1.7503701448440552, "logits/rejected": -1.769147276878357, "logps/chosen": -47.291343688964844, "logps/rejected": -66.85671997070312, "loss": 0.5866, "rewards/accuracies": 0.0, "rewards/chosen": 4.265561580657959, "rewards/margins": -0.4270486831665039, "rewards/rejected": 4.692610263824463, "step": 6833 }, { "epoch": 1.51, "learning_rate": 1.4784345842278002e-06, "logits/chosen": -1.906540870666504, "logits/rejected": -1.86745023727417, "logps/chosen": -93.11361694335938, "logps/rejected": -75.54759216308594, "loss": 0.2662, "rewards/accuracies": 1.0, "rewards/chosen": 7.685003757476807, "rewards/margins": 3.9090516567230225, "rewards/rejected": 3.775952100753784, "step": 6834 }, { "epoch": 1.51, "learning_rate": 1.4771624556557912e-06, "logits/chosen": -1.8242619037628174, "logits/rejected": -1.7394832372665405, "logps/chosen": -57.94406509399414, "logps/rejected": -50.14836883544922, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 2.858391284942627, "rewards/margins": 1.9849522113800049, "rewards/rejected": 0.8734390139579773, "step": 6835 }, { "epoch": 1.51, "learning_rate": 1.475890779759938e-06, "logits/chosen": -2.1333444118499756, "logits/rejected": -2.1247718334198, "logps/chosen": -116.19824981689453, "logps/rejected": -77.61164093017578, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 6.575869083404541, "rewards/margins": 2.136805534362793, "rewards/rejected": 4.439063549041748, "step": 6836 }, { "epoch": 1.51, "learning_rate": 1.474619556703648e-06, "logits/chosen": -1.9001059532165527, "logits/rejected": -1.9324003458023071, "logps/chosen": -58.735801696777344, "logps/rejected": -138.21649169921875, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": 10.480925559997559, "rewards/margins": 1.8641786575317383, "rewards/rejected": 8.61674690246582, "step": 6837 }, { "epoch": 1.51, "learning_rate": 1.4733487866502698e-06, "logits/chosen": -2.129112720489502, "logits/rejected": -2.0723516941070557, "logps/chosen": -50.255836486816406, "logps/rejected": -13.931672096252441, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": 2.7386720180511475, "rewards/margins": 1.9483362436294556, "rewards/rejected": 0.7903357744216919, "step": 6838 }, { "epoch": 1.51, "learning_rate": 1.4720784697630968e-06, "logits/chosen": -2.185023546218872, "logits/rejected": -2.185023546218872, "logps/chosen": -60.648136138916016, "logps/rejected": -60.648136138916016, "loss": 0.3475, "rewards/accuracies": 0.0, "rewards/chosen": 5.8463826179504395, "rewards/margins": 0.0, "rewards/rejected": 5.8463826179504395, "step": 6839 }, { "epoch": 1.51, "learning_rate": 1.4708086062053562e-06, "logits/chosen": -1.6856764554977417, "logits/rejected": -1.589237928390503, "logps/chosen": -91.99278259277344, "logps/rejected": -59.438934326171875, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 7.9376540184021, "rewards/margins": 3.2909231185913086, "rewards/rejected": 4.646730899810791, "step": 6840 }, { "epoch": 1.51, "learning_rate": 1.4695391961402288e-06, "logits/chosen": -1.9455591440200806, "logits/rejected": -1.9182289838790894, "logps/chosen": -51.57238006591797, "logps/rejected": -45.456058502197266, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 4.7808146476745605, "rewards/margins": 1.1436827182769775, "rewards/rejected": 3.637131929397583, "step": 6841 }, { "epoch": 1.51, "learning_rate": 1.4682702397308246e-06, "logits/chosen": -1.869462013244629, "logits/rejected": -1.76716947555542, "logps/chosen": -92.19420623779297, "logps/rejected": -226.49766540527344, "loss": 0.1348, "rewards/accuracies": 1.0, "rewards/chosen": 9.25471019744873, "rewards/margins": 1.337881088256836, "rewards/rejected": 7.9168291091918945, "step": 6842 }, { "epoch": 1.51, "learning_rate": 1.4670017371402074e-06, "logits/chosen": -1.8589329719543457, "logits/rejected": -1.6566816568374634, "logps/chosen": -84.88117980957031, "logps/rejected": -74.83472442626953, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 7.473247051239014, "rewards/margins": 6.514974594116211, "rewards/rejected": 0.9582725763320923, "step": 6843 }, { "epoch": 1.51, "learning_rate": 1.4657336885313728e-06, "logits/chosen": -2.0101675987243652, "logits/rejected": -2.0101675987243652, "logps/chosen": -41.30390930175781, "logps/rejected": -41.30390930175781, "loss": 1.005, "rewards/accuracies": 0.0, "rewards/chosen": 4.1191911697387695, "rewards/margins": 0.0, "rewards/rejected": 4.1191911697387695, "step": 6844 }, { "epoch": 1.52, "learning_rate": 1.4644660940672628e-06, "logits/chosen": -1.8925377130508423, "logits/rejected": -1.8759474754333496, "logps/chosen": -24.674406051635742, "logps/rejected": -25.635013580322266, "loss": 0.2429, "rewards/accuracies": 1.0, "rewards/chosen": 3.1057848930358887, "rewards/margins": 0.5637376308441162, "rewards/rejected": 2.5420472621917725, "step": 6845 }, { "epoch": 1.52, "learning_rate": 1.4631989539107605e-06, "logits/chosen": -1.9911755323410034, "logits/rejected": -1.8357107639312744, "logps/chosen": -102.91580200195312, "logps/rejected": -31.551654815673828, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 6.731626987457275, "rewards/margins": 5.723200798034668, "rewards/rejected": 1.008426308631897, "step": 6846 }, { "epoch": 1.52, "learning_rate": 1.4619322682246906e-06, "logits/chosen": -1.959588646888733, "logits/rejected": -1.959588646888733, "logps/chosen": -8.473989486694336, "logps/rejected": -8.473989486694336, "loss": 0.4169, "rewards/accuracies": 0.0, "rewards/chosen": 2.321714401245117, "rewards/margins": 0.0, "rewards/rejected": 2.321714401245117, "step": 6847 }, { "epoch": 1.52, "learning_rate": 1.4606660371718202e-06, "logits/chosen": -2.2459259033203125, "logits/rejected": -2.206343650817871, "logps/chosen": -105.7640151977539, "logps/rejected": -65.77574920654297, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 6.102913856506348, "rewards/margins": 2.3178467750549316, "rewards/rejected": 3.785067081451416, "step": 6848 }, { "epoch": 1.52, "learning_rate": 1.4594002609148528e-06, "logits/chosen": -1.8980692625045776, "logits/rejected": -1.8960703611373901, "logps/chosen": -47.451263427734375, "logps/rejected": -37.59001159667969, "loss": 0.2006, "rewards/accuracies": 1.0, "rewards/chosen": 4.372747898101807, "rewards/margins": 0.9624786376953125, "rewards/rejected": 3.410269260406494, "step": 6849 }, { "epoch": 1.52, "learning_rate": 1.4581349396164435e-06, "logits/chosen": -1.9976677894592285, "logits/rejected": -1.2058602571487427, "logps/chosen": -49.40544891357422, "logps/rejected": -128.21897888183594, "loss": 0.3521, "rewards/accuracies": 1.0, "rewards/chosen": 6.404112339019775, "rewards/margins": 0.1051177978515625, "rewards/rejected": 6.298994541168213, "step": 6850 }, { "epoch": 1.52, "learning_rate": 1.4568700734391772e-06, "logits/chosen": -1.9241156578063965, "logits/rejected": -1.8464338779449463, "logps/chosen": -54.00614929199219, "logps/rejected": -81.25531768798828, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": 6.745609283447266, "rewards/margins": 2.210587978363037, "rewards/rejected": 4.5350213050842285, "step": 6851 }, { "epoch": 1.52, "learning_rate": 1.4556056625455922e-06, "logits/chosen": -1.9808679819107056, "logits/rejected": -1.9703646898269653, "logps/chosen": -63.63887023925781, "logps/rejected": -102.46289825439453, "loss": 0.712, "rewards/accuracies": 0.0, "rewards/chosen": 8.28331470489502, "rewards/margins": -1.031412124633789, "rewards/rejected": 9.314726829528809, "step": 6852 }, { "epoch": 1.52, "learning_rate": 1.454341707098157e-06, "logits/chosen": -1.83323073387146, "logits/rejected": -1.789566159248352, "logps/chosen": -34.756690979003906, "logps/rejected": -59.269813537597656, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 3.862866163253784, "rewards/margins": 1.0857703685760498, "rewards/rejected": 2.7770957946777344, "step": 6853 }, { "epoch": 1.52, "learning_rate": 1.4530782072592892e-06, "logits/chosen": -2.057788848876953, "logits/rejected": -2.092757225036621, "logps/chosen": -158.83706665039062, "logps/rejected": -134.642333984375, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 8.60974407196045, "rewards/margins": 2.8109798431396484, "rewards/rejected": 5.798764228820801, "step": 6854 }, { "epoch": 1.52, "learning_rate": 1.4518151631913452e-06, "logits/chosen": -1.7750977277755737, "logits/rejected": -1.757246494293213, "logps/chosen": -21.241519927978516, "logps/rejected": -33.6839599609375, "loss": 0.7671, "rewards/accuracies": 0.0, "rewards/chosen": 2.3929755687713623, "rewards/margins": -1.2857170104980469, "rewards/rejected": 3.678692579269409, "step": 6855 }, { "epoch": 1.52, "learning_rate": 1.450552575056623e-06, "logits/chosen": -1.7651242017745972, "logits/rejected": -1.7920937538146973, "logps/chosen": -62.56632614135742, "logps/rejected": -69.05055236816406, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 4.861847400665283, "rewards/margins": 1.9504642486572266, "rewards/rejected": 2.9113831520080566, "step": 6856 }, { "epoch": 1.52, "learning_rate": 1.4492904430173644e-06, "logits/chosen": -1.6436139345169067, "logits/rejected": -1.6529202461242676, "logps/chosen": -19.468036651611328, "logps/rejected": -50.32961654663086, "loss": 0.2164, "rewards/accuracies": 1.0, "rewards/chosen": 2.5092923641204834, "rewards/margins": 0.8029534816741943, "rewards/rejected": 1.706338882446289, "step": 6857 }, { "epoch": 1.52, "learning_rate": 1.448028767235744e-06, "logits/chosen": -2.041463613510132, "logits/rejected": -2.0764622688293457, "logps/chosen": -97.00762939453125, "logps/rejected": -150.48385620117188, "loss": 2.8362, "rewards/accuracies": 0.0, "rewards/chosen": 4.953652858734131, "rewards/margins": -5.668518543243408, "rewards/rejected": 10.622171401977539, "step": 6858 }, { "epoch": 1.52, "learning_rate": 1.4467675478738923e-06, "logits/chosen": -2.2221269607543945, "logits/rejected": -2.196063995361328, "logps/chosen": -102.3139419555664, "logps/rejected": -121.72948455810547, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": 10.86275863647461, "rewards/margins": 2.631985664367676, "rewards/rejected": 8.230772972106934, "step": 6859 }, { "epoch": 1.52, "learning_rate": 1.445506785093866e-06, "logits/chosen": -1.835727334022522, "logits/rejected": -1.7766042947769165, "logps/chosen": -47.37534713745117, "logps/rejected": -41.41119384765625, "loss": 0.7112, "rewards/accuracies": 0.0, "rewards/chosen": 4.165079116821289, "rewards/margins": -0.38555431365966797, "rewards/rejected": 4.550633430480957, "step": 6860 }, { "epoch": 1.52, "learning_rate": 1.4442464790576755e-06, "logits/chosen": -1.9816675186157227, "logits/rejected": -1.934660792350769, "logps/chosen": -66.08132934570312, "logps/rejected": -36.9352912902832, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 6.074427127838135, "rewards/margins": 3.7177231311798096, "rewards/rejected": 2.356703996658325, "step": 6861 }, { "epoch": 1.52, "learning_rate": 1.4429866299272631e-06, "logits/chosen": -1.9553638696670532, "logits/rejected": -1.9410086870193481, "logps/chosen": -105.05546569824219, "logps/rejected": -109.14369201660156, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 8.864967346191406, "rewards/margins": 6.541482448577881, "rewards/rejected": 2.3234848976135254, "step": 6862 }, { "epoch": 1.52, "learning_rate": 1.4417272378645182e-06, "logits/chosen": -1.7813578844070435, "logits/rejected": -1.6602312326431274, "logps/chosen": -52.983375549316406, "logps/rejected": -23.011749267578125, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 3.8709511756896973, "rewards/margins": 2.659491777420044, "rewards/rejected": 1.2114593982696533, "step": 6863 }, { "epoch": 1.52, "learning_rate": 1.4404683030312694e-06, "logits/chosen": -1.9040498733520508, "logits/rejected": -1.8694026470184326, "logps/chosen": -51.180816650390625, "logps/rejected": -53.719451904296875, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 6.442185878753662, "rewards/margins": 3.2345945835113525, "rewards/rejected": 3.2075912952423096, "step": 6864 }, { "epoch": 1.52, "learning_rate": 1.4392098255892866e-06, "logits/chosen": -1.7408941984176636, "logits/rejected": -1.7230051755905151, "logps/chosen": -61.399925231933594, "logps/rejected": -28.920822143554688, "loss": 0.2179, "rewards/accuracies": 1.0, "rewards/chosen": 2.223470449447632, "rewards/margins": 0.898595929145813, "rewards/rejected": 1.3248745203018188, "step": 6865 }, { "epoch": 1.52, "learning_rate": 1.4379518057002834e-06, "logits/chosen": -1.9280873537063599, "logits/rejected": -1.9497759342193604, "logps/chosen": -35.1899299621582, "logps/rejected": -55.07746887207031, "loss": 0.5703, "rewards/accuracies": 1.0, "rewards/chosen": 5.490865707397461, "rewards/margins": 0.5725917816162109, "rewards/rejected": 4.91827392578125, "step": 6866 }, { "epoch": 1.52, "learning_rate": 1.4366942435259068e-06, "logits/chosen": -1.933331847190857, "logits/rejected": -1.9130593538284302, "logps/chosen": -72.58125305175781, "logps/rejected": -32.86257553100586, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 5.651444911956787, "rewards/margins": 2.320795774459839, "rewards/rejected": 3.3306491374969482, "step": 6867 }, { "epoch": 1.52, "learning_rate": 1.4354371392277566e-06, "logits/chosen": -1.7701019048690796, "logits/rejected": -1.7271679639816284, "logps/chosen": -133.764892578125, "logps/rejected": -49.246826171875, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 7.643179416656494, "rewards/margins": 2.817988395690918, "rewards/rejected": 4.825191020965576, "step": 6868 }, { "epoch": 1.52, "learning_rate": 1.4341804929673626e-06, "logits/chosen": -1.9812716245651245, "logits/rejected": -1.9073128700256348, "logps/chosen": -71.60240173339844, "logps/rejected": -24.155595779418945, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 3.005706787109375, "rewards/margins": 3.0792901515960693, "rewards/rejected": -0.07358341664075851, "step": 6869 }, { "epoch": 1.52, "learning_rate": 1.4329243049062058e-06, "logits/chosen": -2.0644478797912598, "logits/rejected": -1.3099682331085205, "logps/chosen": -47.077457427978516, "logps/rejected": -98.17912292480469, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 5.948669910430908, "rewards/margins": 1.8711156845092773, "rewards/rejected": 4.077554225921631, "step": 6870 }, { "epoch": 1.52, "learning_rate": 1.4316685752056996e-06, "logits/chosen": -2.0460784435272217, "logits/rejected": -1.9137511253356934, "logps/chosen": -63.06553649902344, "logps/rejected": -18.993183135986328, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": 3.634469747543335, "rewards/margins": 1.883563756942749, "rewards/rejected": 1.750905990600586, "step": 6871 }, { "epoch": 1.52, "learning_rate": 1.4304133040272034e-06, "logits/chosen": -1.9011287689208984, "logits/rejected": -1.8718023300170898, "logps/chosen": -87.90511322021484, "logps/rejected": -66.01356506347656, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 7.089850902557373, "rewards/margins": 4.092657089233398, "rewards/rejected": 2.9971940517425537, "step": 6872 }, { "epoch": 1.52, "learning_rate": 1.4291584915320166e-06, "logits/chosen": -2.2482967376708984, "logits/rejected": -2.239201545715332, "logps/chosen": -103.05951690673828, "logps/rejected": -148.61984252929688, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 11.936016082763672, "rewards/margins": 4.181392669677734, "rewards/rejected": 7.7546234130859375, "step": 6873 }, { "epoch": 1.52, "learning_rate": 1.4279041378813797e-06, "logits/chosen": -2.01279354095459, "logits/rejected": -1.998481035232544, "logps/chosen": -35.98213577270508, "logps/rejected": -39.244667053222656, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": 3.1564128398895264, "rewards/margins": 0.39450716972351074, "rewards/rejected": 2.7619056701660156, "step": 6874 }, { "epoch": 1.52, "learning_rate": 1.4266502432364737e-06, "logits/chosen": -1.955885887145996, "logits/rejected": -1.929898738861084, "logps/chosen": -112.801025390625, "logps/rejected": -112.19104766845703, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 6.475594997406006, "rewards/margins": 3.6167256832122803, "rewards/rejected": 2.8588693141937256, "step": 6875 }, { "epoch": 1.52, "learning_rate": 1.4253968077584218e-06, "logits/chosen": -1.740462303161621, "logits/rejected": -1.7296993732452393, "logps/chosen": -42.34026336669922, "logps/rejected": -90.94880676269531, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 3.4406471252441406, "rewards/margins": 2.242642879486084, "rewards/rejected": 1.198004126548767, "step": 6876 }, { "epoch": 1.52, "learning_rate": 1.4241438316082884e-06, "logits/chosen": -1.7882392406463623, "logits/rejected": -1.7962486743927002, "logps/chosen": -51.15758514404297, "logps/rejected": -65.85195922851562, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 4.903975963592529, "rewards/margins": 1.430335521697998, "rewards/rejected": 3.4736404418945312, "step": 6877 }, { "epoch": 1.52, "learning_rate": 1.422891314947073e-06, "logits/chosen": -1.7052700519561768, "logits/rejected": -1.5232107639312744, "logps/chosen": -167.13058471679688, "logps/rejected": -63.0483512878418, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 7.3221282958984375, "rewards/margins": 4.217467308044434, "rewards/rejected": 3.104661226272583, "step": 6878 }, { "epoch": 1.52, "learning_rate": 1.421639257935728e-06, "logits/chosen": -2.1144585609436035, "logits/rejected": -2.0159571170806885, "logps/chosen": -86.71989440917969, "logps/rejected": -70.3880615234375, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 7.262132167816162, "rewards/margins": 5.250333786010742, "rewards/rejected": 2.011798143386841, "step": 6879 }, { "epoch": 1.52, "learning_rate": 1.4203876607351347e-06, "logits/chosen": -1.9785727262496948, "logits/rejected": -1.8216710090637207, "logps/chosen": -119.44297790527344, "logps/rejected": -55.32837677001953, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 10.297938346862793, "rewards/margins": 2.8293447494506836, "rewards/rejected": 7.468593597412109, "step": 6880 }, { "epoch": 1.52, "learning_rate": 1.4191365235061216e-06, "logits/chosen": -1.9443496465682983, "logits/rejected": -1.9443496465682983, "logps/chosen": -23.30438995361328, "logps/rejected": -23.30438995361328, "loss": 0.4882, "rewards/accuracies": 0.0, "rewards/chosen": 7.480626583099365, "rewards/margins": 0.0, "rewards/rejected": 7.480626583099365, "step": 6881 }, { "epoch": 1.52, "learning_rate": 1.417885846409457e-06, "logits/chosen": -1.6973012685775757, "logits/rejected": -1.7318872213363647, "logps/chosen": -34.31310272216797, "logps/rejected": -49.01032257080078, "loss": 0.5029, "rewards/accuracies": 0.0, "rewards/chosen": 3.687053680419922, "rewards/margins": -0.5449981689453125, "rewards/rejected": 4.232051849365234, "step": 6882 }, { "epoch": 1.52, "learning_rate": 1.4166356296058504e-06, "logits/chosen": -2.249359369277954, "logits/rejected": -2.1775288581848145, "logps/chosen": -95.01565551757812, "logps/rejected": -67.61233520507812, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": 7.4651641845703125, "rewards/margins": 3.95595383644104, "rewards/rejected": 3.5092103481292725, "step": 6883 }, { "epoch": 1.52, "learning_rate": 1.4153858732559506e-06, "logits/chosen": -1.9839874505996704, "logits/rejected": -1.9561982154846191, "logps/chosen": -37.116920471191406, "logps/rejected": -52.02803421020508, "loss": 0.3281, "rewards/accuracies": 1.0, "rewards/chosen": 3.961449384689331, "rewards/margins": 0.08969378471374512, "rewards/rejected": 3.871755599975586, "step": 6884 }, { "epoch": 1.52, "learning_rate": 1.4141365775203497e-06, "logits/chosen": -2.175215721130371, "logits/rejected": -2.197028875350952, "logps/chosen": -64.2479248046875, "logps/rejected": -93.90032196044922, "loss": 0.2424, "rewards/accuracies": 1.0, "rewards/chosen": 10.170473098754883, "rewards/margins": 0.5130529403686523, "rewards/rejected": 9.65742015838623, "step": 6885 }, { "epoch": 1.52, "learning_rate": 1.4128877425595795e-06, "logits/chosen": -1.8141216039657593, "logits/rejected": -1.8429433107376099, "logps/chosen": -35.79963684082031, "logps/rejected": -48.41957092285156, "loss": 1.2429, "rewards/accuracies": 0.0, "rewards/chosen": 3.521840810775757, "rewards/margins": -1.8269593715667725, "rewards/rejected": 5.348800182342529, "step": 6886 }, { "epoch": 1.52, "learning_rate": 1.4116393685341096e-06, "logits/chosen": -1.8268471956253052, "logits/rejected": -1.783379077911377, "logps/chosen": -81.21534729003906, "logps/rejected": -56.23406219482422, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 7.417234897613525, "rewards/margins": 2.4883742332458496, "rewards/rejected": 4.928860664367676, "step": 6887 }, { "epoch": 1.52, "learning_rate": 1.4103914556043547e-06, "logits/chosen": -1.8193981647491455, "logits/rejected": -1.5831670761108398, "logps/chosen": -121.02570343017578, "logps/rejected": -60.7796745300293, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": 7.215856075286865, "rewards/margins": 4.188666820526123, "rewards/rejected": 3.027189254760742, "step": 6888 }, { "epoch": 1.52, "learning_rate": 1.4091440039306686e-06, "logits/chosen": -1.9287225008010864, "logits/rejected": -1.9341131448745728, "logps/chosen": -43.63426208496094, "logps/rejected": -51.37117004394531, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": 4.8205156326293945, "rewards/margins": 1.4088518619537354, "rewards/rejected": 3.411663770675659, "step": 6889 }, { "epoch": 1.53, "learning_rate": 1.4078970136733456e-06, "logits/chosen": -1.8040341138839722, "logits/rejected": -1.5237637758255005, "logps/chosen": -42.28204345703125, "logps/rejected": -54.39496994018555, "loss": 0.1642, "rewards/accuracies": 1.0, "rewards/chosen": 3.3501899242401123, "rewards/margins": 1.030888557434082, "rewards/rejected": 2.3193013668060303, "step": 6890 }, { "epoch": 1.53, "learning_rate": 1.4066504849926216e-06, "logits/chosen": -1.9663400650024414, "logits/rejected": -1.7630243301391602, "logps/chosen": -134.00665283203125, "logps/rejected": -45.33115768432617, "loss": 0.3611, "rewards/accuracies": 1.0, "rewards/chosen": 7.157273769378662, "rewards/margins": 3.5244462490081787, "rewards/rejected": 3.6328275203704834, "step": 6891 }, { "epoch": 1.53, "learning_rate": 1.405404418048672e-06, "logits/chosen": -1.873701810836792, "logits/rejected": -1.603818416595459, "logps/chosen": -197.8870086669922, "logps/rejected": -96.17015075683594, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 10.378615379333496, "rewards/margins": 4.952685832977295, "rewards/rejected": 5.425929546356201, "step": 6892 }, { "epoch": 1.53, "learning_rate": 1.4041588130016132e-06, "logits/chosen": -2.0416955947875977, "logits/rejected": -1.9721635580062866, "logps/chosen": -73.82832336425781, "logps/rejected": -49.674320220947266, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": 5.595558166503906, "rewards/margins": 0.9645256996154785, "rewards/rejected": 4.631032466888428, "step": 6893 }, { "epoch": 1.53, "learning_rate": 1.4029136700115031e-06, "logits/chosen": -1.8425389528274536, "logits/rejected": -1.8066202402114868, "logps/chosen": -120.21929168701172, "logps/rejected": -93.5338134765625, "loss": 0.2485, "rewards/accuracies": 1.0, "rewards/chosen": 8.335602760314941, "rewards/margins": 0.7135415077209473, "rewards/rejected": 7.622061252593994, "step": 6894 }, { "epoch": 1.53, "learning_rate": 1.4016689892383389e-06, "logits/chosen": -1.9092892408370972, "logits/rejected": -2.018718957901001, "logps/chosen": -24.108564376831055, "logps/rejected": -75.93576049804688, "loss": 1.8434, "rewards/accuracies": 0.0, "rewards/chosen": 4.725027561187744, "rewards/margins": -3.660778522491455, "rewards/rejected": 8.3858060836792, "step": 6895 }, { "epoch": 1.53, "learning_rate": 1.4004247708420615e-06, "logits/chosen": -1.9144448041915894, "logits/rejected": -1.9366482496261597, "logps/chosen": -27.373207092285156, "logps/rejected": -38.34880447387695, "loss": 0.5358, "rewards/accuracies": 0.0, "rewards/chosen": 3.1741273403167725, "rewards/margins": -0.5398752689361572, "rewards/rejected": 3.7140026092529297, "step": 6896 }, { "epoch": 1.53, "learning_rate": 1.3991810149825458e-06, "logits/chosen": -1.8546185493469238, "logits/rejected": -1.8428411483764648, "logps/chosen": -53.79396438598633, "logps/rejected": -40.890350341796875, "loss": 0.1726, "rewards/accuracies": 1.0, "rewards/chosen": 4.924435138702393, "rewards/margins": 1.1372876167297363, "rewards/rejected": 3.7871475219726562, "step": 6897 }, { "epoch": 1.53, "learning_rate": 1.3979377218196138e-06, "logits/chosen": -1.8535561561584473, "logits/rejected": -1.611690640449524, "logps/chosen": -77.33653259277344, "logps/rejected": -48.82062911987305, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 3.0145676136016846, "rewards/margins": 2.033726453781128, "rewards/rejected": 0.9808411002159119, "step": 6898 }, { "epoch": 1.53, "learning_rate": 1.3966948915130257e-06, "logits/chosen": -2.07053804397583, "logits/rejected": -2.0260045528411865, "logps/chosen": -54.499664306640625, "logps/rejected": -85.18583679199219, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 5.1255927085876465, "rewards/margins": 2.0968406200408936, "rewards/rejected": 3.028752088546753, "step": 6899 }, { "epoch": 1.53, "learning_rate": 1.3954525242224814e-06, "logits/chosen": -1.9105725288391113, "logits/rejected": -1.8583478927612305, "logps/chosen": -98.91361236572266, "logps/rejected": -108.46991729736328, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 11.163249015808105, "rewards/margins": 2.3080339431762695, "rewards/rejected": 8.855215072631836, "step": 6900 }, { "epoch": 1.53, "learning_rate": 1.3942106201076233e-06, "logits/chosen": -2.085937976837158, "logits/rejected": -2.1110727787017822, "logps/chosen": -83.36146545410156, "logps/rejected": -141.1451873779297, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": 10.299493789672852, "rewards/margins": 0.8238668441772461, "rewards/rejected": 9.475626945495605, "step": 6901 }, { "epoch": 1.53, "learning_rate": 1.392969179328032e-06, "logits/chosen": -1.706334114074707, "logits/rejected": -1.6731641292572021, "logps/chosen": -34.224605560302734, "logps/rejected": -81.95452880859375, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 4.852962017059326, "rewards/margins": 0.5789051055908203, "rewards/rejected": 4.274056911468506, "step": 6902 }, { "epoch": 1.53, "learning_rate": 1.39172820204323e-06, "logits/chosen": -2.3222529888153076, "logits/rejected": -2.3120319843292236, "logps/chosen": -41.31599426269531, "logps/rejected": -20.3856201171875, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 3.2774384021759033, "rewards/margins": 2.589101552963257, "rewards/rejected": 0.6883367896080017, "step": 6903 }, { "epoch": 1.53, "learning_rate": 1.39048768841268e-06, "logits/chosen": -1.9444631338119507, "logits/rejected": -1.8605096340179443, "logps/chosen": -107.849609375, "logps/rejected": -55.179656982421875, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 6.545477390289307, "rewards/margins": 3.1469407081604004, "rewards/rejected": 3.3985366821289062, "step": 6904 }, { "epoch": 1.53, "learning_rate": 1.389247638595787e-06, "logits/chosen": -2.0756375789642334, "logits/rejected": -2.0473756790161133, "logps/chosen": -59.52812957763672, "logps/rejected": -58.06006622314453, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 5.711472988128662, "rewards/margins": 2.240710973739624, "rewards/rejected": 3.470762014389038, "step": 6905 }, { "epoch": 1.53, "learning_rate": 1.3880080527518892e-06, "logits/chosen": -1.8589942455291748, "logits/rejected": -1.8589942455291748, "logps/chosen": -63.94044494628906, "logps/rejected": -63.94044494628906, "loss": 0.9508, "rewards/accuracies": 0.0, "rewards/chosen": 4.245660305023193, "rewards/margins": 0.0, "rewards/rejected": 4.245660305023193, "step": 6906 }, { "epoch": 1.53, "learning_rate": 1.3867689310402765e-06, "logits/chosen": -1.2728965282440186, "logits/rejected": -1.2728965282440186, "logps/chosen": -4.5165510177612305, "logps/rejected": -4.5165510177612305, "loss": 1.3498, "rewards/accuracies": 0.0, "rewards/chosen": 0.7871292233467102, "rewards/margins": 0.0, "rewards/rejected": 0.7871292233467102, "step": 6907 }, { "epoch": 1.53, "learning_rate": 1.3855302736201686e-06, "logits/chosen": -2.035414695739746, "logits/rejected": -2.1236696243286133, "logps/chosen": -53.779327392578125, "logps/rejected": -138.92562866210938, "loss": 0.9881, "rewards/accuracies": 0.0, "rewards/chosen": 6.8383708000183105, "rewards/margins": -1.8104147911071777, "rewards/rejected": 8.648785591125488, "step": 6908 }, { "epoch": 1.53, "learning_rate": 1.3842920806507315e-06, "logits/chosen": -2.2339932918548584, "logits/rejected": -2.0940957069396973, "logps/chosen": -186.8834228515625, "logps/rejected": -65.6054458618164, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": 6.827455043792725, "rewards/margins": 3.5581350326538086, "rewards/rejected": 3.269320011138916, "step": 6909 }, { "epoch": 1.53, "learning_rate": 1.3830543522910705e-06, "logits/chosen": -1.6836800575256348, "logits/rejected": -1.6809470653533936, "logps/chosen": -47.946372985839844, "logps/rejected": -64.27086639404297, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 4.009281158447266, "rewards/margins": 2.124211072921753, "rewards/rejected": 1.8850700855255127, "step": 6910 }, { "epoch": 1.53, "learning_rate": 1.3818170887002303e-06, "logits/chosen": -1.9064134359359741, "logits/rejected": -1.9064134359359741, "logps/chosen": -34.571922302246094, "logps/rejected": -34.571922302246094, "loss": 0.3483, "rewards/accuracies": 0.0, "rewards/chosen": 5.329219818115234, "rewards/margins": 0.0, "rewards/rejected": 5.329219818115234, "step": 6911 }, { "epoch": 1.53, "learning_rate": 1.3805802900371962e-06, "logits/chosen": -1.958726406097412, "logits/rejected": -1.967725157737732, "logps/chosen": -138.61663818359375, "logps/rejected": -166.0460662841797, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 13.192733764648438, "rewards/margins": 4.237301826477051, "rewards/rejected": 8.955431938171387, "step": 6912 }, { "epoch": 1.53, "learning_rate": 1.379343956460894e-06, "logits/chosen": -1.6624677181243896, "logits/rejected": -1.6624677181243896, "logps/chosen": -22.721458435058594, "logps/rejected": -22.721458435058594, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 2.445152759552002, "rewards/margins": 0.0, "rewards/rejected": 2.445152759552002, "step": 6913 }, { "epoch": 1.53, "learning_rate": 1.3781080881301912e-06, "logits/chosen": -1.9801344871520996, "logits/rejected": -1.9882797002792358, "logps/chosen": -37.75217056274414, "logps/rejected": -35.615848541259766, "loss": 0.6078, "rewards/accuracies": 0.0, "rewards/chosen": 2.5735538005828857, "rewards/margins": -0.7098572254180908, "rewards/rejected": 3.2834110260009766, "step": 6914 }, { "epoch": 1.53, "learning_rate": 1.376872685203889e-06, "logits/chosen": -2.246892213821411, "logits/rejected": -2.206123113632202, "logps/chosen": -85.00117492675781, "logps/rejected": -139.50531005859375, "loss": 0.3509, "rewards/accuracies": 1.0, "rewards/chosen": 11.20256519317627, "rewards/margins": 0.13654041290283203, "rewards/rejected": 11.066024780273438, "step": 6915 }, { "epoch": 1.53, "learning_rate": 1.3756377478407402e-06, "logits/chosen": -1.7283077239990234, "logits/rejected": -1.6988532543182373, "logps/chosen": -62.04804992675781, "logps/rejected": -87.29461669921875, "loss": 0.1359, "rewards/accuracies": 1.0, "rewards/chosen": 4.007384777069092, "rewards/margins": 1.371307611465454, "rewards/rejected": 2.6360771656036377, "step": 6916 }, { "epoch": 1.53, "learning_rate": 1.3744032761994264e-06, "logits/chosen": -2.1598570346832275, "logits/rejected": -2.1949894428253174, "logps/chosen": -80.17508697509766, "logps/rejected": -139.32839965820312, "loss": 1.6511, "rewards/accuracies": 0.0, "rewards/chosen": 4.965059757232666, "rewards/margins": -3.246471881866455, "rewards/rejected": 8.211531639099121, "step": 6917 }, { "epoch": 1.53, "learning_rate": 1.3731692704385762e-06, "logits/chosen": -1.865033507347107, "logits/rejected": -1.7265965938568115, "logps/chosen": -37.52095031738281, "logps/rejected": -24.793071746826172, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 2.940211534500122, "rewards/margins": 1.610547661781311, "rewards/rejected": 1.329663872718811, "step": 6918 }, { "epoch": 1.53, "learning_rate": 1.3719357307167558e-06, "logits/chosen": -1.922922968864441, "logits/rejected": -1.814643383026123, "logps/chosen": -69.59007263183594, "logps/rejected": -12.612506866455078, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 5.633460998535156, "rewards/margins": 2.8497860431671143, "rewards/rejected": 2.783674955368042, "step": 6919 }, { "epoch": 1.53, "learning_rate": 1.3707026571924726e-06, "logits/chosen": -1.8108570575714111, "logits/rejected": -1.6649744510650635, "logps/chosen": -62.34373474121094, "logps/rejected": -4.9789204597473145, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 3.648890733718872, "rewards/margins": 2.9335227012634277, "rewards/rejected": 0.7153679728507996, "step": 6920 }, { "epoch": 1.53, "learning_rate": 1.369470050024173e-06, "logits/chosen": -1.8696792125701904, "logits/rejected": -1.8692646026611328, "logps/chosen": -36.79417037963867, "logps/rejected": -37.25695037841797, "loss": 0.3555, "rewards/accuracies": 1.0, "rewards/chosen": 4.025125503540039, "rewards/margins": 0.13822507858276367, "rewards/rejected": 3.8869004249572754, "step": 6921 }, { "epoch": 1.53, "learning_rate": 1.3682379093702447e-06, "logits/chosen": -1.9395442008972168, "logits/rejected": -1.9437509775161743, "logps/chosen": -29.41135025024414, "logps/rejected": -42.92268753051758, "loss": 0.2519, "rewards/accuracies": 1.0, "rewards/chosen": 4.142670154571533, "rewards/margins": 0.5108826160430908, "rewards/rejected": 3.6317875385284424, "step": 6922 }, { "epoch": 1.53, "learning_rate": 1.3670062353890163e-06, "logits/chosen": -2.0485854148864746, "logits/rejected": -2.0353636741638184, "logps/chosen": -78.59371948242188, "logps/rejected": -43.86848831176758, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 4.885584354400635, "rewards/margins": 0.750300407409668, "rewards/rejected": 4.135283946990967, "step": 6923 }, { "epoch": 1.53, "learning_rate": 1.36577502823875e-06, "logits/chosen": -1.61369788646698, "logits/rejected": -1.5957810878753662, "logps/chosen": -36.222862243652344, "logps/rejected": -54.12161636352539, "loss": 0.4497, "rewards/accuracies": 1.0, "rewards/chosen": 2.103743314743042, "rewards/margins": 1.4734461307525635, "rewards/rejected": 0.6302971243858337, "step": 6924 }, { "epoch": 1.53, "learning_rate": 1.364544288077659e-06, "logits/chosen": -1.8891371488571167, "logits/rejected": -1.7476625442504883, "logps/chosen": -65.40506744384766, "logps/rejected": -35.15960693359375, "loss": 0.344, "rewards/accuracies": 1.0, "rewards/chosen": 2.353012800216675, "rewards/margins": 0.05532670021057129, "rewards/rejected": 2.2976861000061035, "step": 6925 }, { "epoch": 1.53, "learning_rate": 1.3633140150638846e-06, "logits/chosen": -1.9932441711425781, "logits/rejected": -2.0184438228607178, "logps/chosen": -91.26778411865234, "logps/rejected": -108.75050354003906, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 6.347472667694092, "rewards/margins": 4.120032787322998, "rewards/rejected": 2.2274398803710938, "step": 6926 }, { "epoch": 1.53, "learning_rate": 1.3620842093555197e-06, "logits/chosen": -1.9741462469100952, "logits/rejected": -2.028724431991577, "logps/chosen": -70.59131622314453, "logps/rejected": -111.0255126953125, "loss": 1.5155, "rewards/accuracies": 0.0, "rewards/chosen": 5.9329352378845215, "rewards/margins": -2.957335948944092, "rewards/rejected": 8.890271186828613, "step": 6927 }, { "epoch": 1.53, "learning_rate": 1.3608548711105874e-06, "logits/chosen": -2.243920087814331, "logits/rejected": -2.141775608062744, "logps/chosen": -70.26082611083984, "logps/rejected": -43.43222427368164, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 6.897501468658447, "rewards/margins": 5.691516399383545, "rewards/rejected": 1.2059849500656128, "step": 6928 }, { "epoch": 1.53, "learning_rate": 1.359626000487056e-06, "logits/chosen": -1.811967134475708, "logits/rejected": -1.8512754440307617, "logps/chosen": -76.98187255859375, "logps/rejected": -164.6134033203125, "loss": 0.9047, "rewards/accuracies": 0.0, "rewards/chosen": 11.620640754699707, "rewards/margins": -1.5487689971923828, "rewards/rejected": 13.16940975189209, "step": 6929 }, { "epoch": 1.53, "learning_rate": 1.358397597642832e-06, "logits/chosen": -1.7941863536834717, "logits/rejected": -1.6474522352218628, "logps/chosen": -153.15560913085938, "logps/rejected": -32.83749771118164, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 5.71307373046875, "rewards/margins": 4.004467964172363, "rewards/rejected": 1.7086056470870972, "step": 6930 }, { "epoch": 1.53, "learning_rate": 1.3571696627357628e-06, "logits/chosen": -1.54474937915802, "logits/rejected": -1.4753727912902832, "logps/chosen": -49.26806640625, "logps/rejected": -54.91108703613281, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 4.434286594390869, "rewards/margins": 2.248495578765869, "rewards/rejected": 2.185791015625, "step": 6931 }, { "epoch": 1.53, "learning_rate": 1.3559421959236357e-06, "logits/chosen": -1.797560453414917, "logits/rejected": -1.7507522106170654, "logps/chosen": -51.42741394042969, "logps/rejected": -10.641606330871582, "loss": 0.2826, "rewards/accuracies": 1.0, "rewards/chosen": 3.4918320178985596, "rewards/margins": 2.3461523056030273, "rewards/rejected": 1.1456798315048218, "step": 6932 }, { "epoch": 1.53, "learning_rate": 1.3547151973641743e-06, "logits/chosen": -1.7719230651855469, "logits/rejected": -1.79082190990448, "logps/chosen": -45.74500274658203, "logps/rejected": -84.09994506835938, "loss": 0.7665, "rewards/accuracies": 0.0, "rewards/chosen": 5.474137306213379, "rewards/margins": -1.2898154258728027, "rewards/rejected": 6.763952732086182, "step": 6933 }, { "epoch": 1.53, "learning_rate": 1.3534886672150493e-06, "logits/chosen": -2.008695602416992, "logits/rejected": -1.9311550855636597, "logps/chosen": -58.331809997558594, "logps/rejected": -9.429169654846191, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 5.856905460357666, "rewards/margins": 3.9414539337158203, "rewards/rejected": 1.9154514074325562, "step": 6934 }, { "epoch": 1.53, "learning_rate": 1.3522626056338618e-06, "logits/chosen": -1.808340311050415, "logits/rejected": -1.8334662914276123, "logps/chosen": -40.030670166015625, "logps/rejected": -54.30011749267578, "loss": 0.7041, "rewards/accuracies": 1.0, "rewards/chosen": 3.4261536598205566, "rewards/margins": 0.2165367603302002, "rewards/rejected": 3.2096168994903564, "step": 6935 }, { "epoch": 1.54, "learning_rate": 1.3510370127781635e-06, "logits/chosen": -1.8202526569366455, "logits/rejected": -1.828020691871643, "logps/chosen": -47.560943603515625, "logps/rejected": -72.71708679199219, "loss": 0.4776, "rewards/accuracies": 0.0, "rewards/chosen": 3.410830020904541, "rewards/margins": -0.3800995349884033, "rewards/rejected": 3.7909295558929443, "step": 6936 }, { "epoch": 1.54, "learning_rate": 1.3498118888054363e-06, "logits/chosen": -1.8443493843078613, "logits/rejected": -1.8207415342330933, "logps/chosen": -26.81780242919922, "logps/rejected": -35.64918518066406, "loss": 0.8255, "rewards/accuracies": 0.0, "rewards/chosen": 2.6918885707855225, "rewards/margins": -1.2363455295562744, "rewards/rejected": 3.928234100341797, "step": 6937 }, { "epoch": 1.54, "learning_rate": 1.348587233873106e-06, "logits/chosen": -1.9840213060379028, "logits/rejected": -1.9746675491333008, "logps/chosen": -71.54132080078125, "logps/rejected": -66.12340545654297, "loss": 0.161, "rewards/accuracies": 1.0, "rewards/chosen": 4.463258266448975, "rewards/margins": 1.0101141929626465, "rewards/rejected": 3.453144073486328, "step": 6938 }, { "epoch": 1.54, "learning_rate": 1.3473630481385397e-06, "logits/chosen": -1.7945727109909058, "logits/rejected": -1.7704241275787354, "logps/chosen": -60.093963623046875, "logps/rejected": -47.673824310302734, "loss": 0.1895, "rewards/accuracies": 1.0, "rewards/chosen": 3.4703903198242188, "rewards/margins": 0.8006908893585205, "rewards/rejected": 2.6696994304656982, "step": 6939 }, { "epoch": 1.54, "learning_rate": 1.3461393317590383e-06, "logits/chosen": -1.9603122472763062, "logits/rejected": -1.9254499673843384, "logps/chosen": -85.23246765136719, "logps/rejected": -51.495643615722656, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 5.830644130706787, "rewards/margins": 2.2976415157318115, "rewards/rejected": 3.5330026149749756, "step": 6940 }, { "epoch": 1.54, "learning_rate": 1.3449160848918524e-06, "logits/chosen": -1.7903099060058594, "logits/rejected": -1.7801822423934937, "logps/chosen": -70.13275146484375, "logps/rejected": -68.3753433227539, "loss": 0.5365, "rewards/accuracies": 1.0, "rewards/chosen": 4.828819274902344, "rewards/margins": 2.683896541595459, "rewards/rejected": 2.1449227333068848, "step": 6941 }, { "epoch": 1.54, "learning_rate": 1.3436933076941595e-06, "logits/chosen": -2.085205554962158, "logits/rejected": -1.9471369981765747, "logps/chosen": -60.43871307373047, "logps/rejected": -31.179418563842773, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 7.112153053283691, "rewards/margins": 6.716719627380371, "rewards/rejected": 0.395433634519577, "step": 6942 }, { "epoch": 1.54, "learning_rate": 1.34247100032309e-06, "logits/chosen": -1.7538115978240967, "logits/rejected": -1.7544077634811401, "logps/chosen": -27.894920349121094, "logps/rejected": -34.78582763671875, "loss": 0.69, "rewards/accuracies": 0.0, "rewards/chosen": 3.2708847522735596, "rewards/margins": -1.0860435962677002, "rewards/rejected": 4.35692834854126, "step": 6943 }, { "epoch": 1.54, "learning_rate": 1.3412491629357023e-06, "logits/chosen": -1.637089729309082, "logits/rejected": -1.637089729309082, "logps/chosen": -16.28890037536621, "logps/rejected": -16.28890037536621, "loss": 0.3766, "rewards/accuracies": 0.0, "rewards/chosen": 2.8833823204040527, "rewards/margins": 0.0, "rewards/rejected": 2.8833823204040527, "step": 6944 }, { "epoch": 1.54, "learning_rate": 1.340027795689004e-06, "logits/chosen": -1.7778962850570679, "logits/rejected": -1.7605551481246948, "logps/chosen": -52.13645553588867, "logps/rejected": -72.29019165039062, "loss": 0.4038, "rewards/accuracies": 1.0, "rewards/chosen": 3.6992268562316895, "rewards/margins": 1.4163014888763428, "rewards/rejected": 2.2829253673553467, "step": 6945 }, { "epoch": 1.54, "learning_rate": 1.3388068987399343e-06, "logits/chosen": -1.7030930519104004, "logits/rejected": -1.795607089996338, "logps/chosen": -24.508773803710938, "logps/rejected": -64.25605773925781, "loss": 0.3648, "rewards/accuracies": 1.0, "rewards/chosen": 3.2747018337249756, "rewards/margins": 0.22976922988891602, "rewards/rejected": 3.0449326038360596, "step": 6946 }, { "epoch": 1.54, "learning_rate": 1.337586472245377e-06, "logits/chosen": -1.9862265586853027, "logits/rejected": -1.9007891416549683, "logps/chosen": -151.04180908203125, "logps/rejected": -47.358970642089844, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": 6.914346218109131, "rewards/margins": 4.650834083557129, "rewards/rejected": 2.263512372970581, "step": 6947 }, { "epoch": 1.54, "learning_rate": 1.3363665163621554e-06, "logits/chosen": -1.7282897233963013, "logits/rejected": -1.6317821741104126, "logps/chosen": -102.30791473388672, "logps/rejected": -59.32868576049805, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 6.5815043449401855, "rewards/margins": 4.87696647644043, "rewards/rejected": 1.7045376300811768, "step": 6948 }, { "epoch": 1.54, "learning_rate": 1.3351470312470261e-06, "logits/chosen": -1.8351114988327026, "logits/rejected": -1.8658872842788696, "logps/chosen": -74.09945678710938, "logps/rejected": -105.93177795410156, "loss": 0.3485, "rewards/accuracies": 1.0, "rewards/chosen": 8.166394233703613, "rewards/margins": 0.14646434783935547, "rewards/rejected": 8.019929885864258, "step": 6949 }, { "epoch": 1.54, "learning_rate": 1.3339280170566959e-06, "logits/chosen": -2.03660249710083, "logits/rejected": -1.978613018989563, "logps/chosen": -74.04664611816406, "logps/rejected": -61.90787887573242, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 6.112074375152588, "rewards/margins": 2.9784634113311768, "rewards/rejected": 3.133610963821411, "step": 6950 }, { "epoch": 1.54, "learning_rate": 1.3327094739478003e-06, "logits/chosen": -1.5879300832748413, "logits/rejected": -1.5879943370819092, "logps/chosen": -52.45075988769531, "logps/rejected": -37.587825775146484, "loss": 0.0771, "rewards/accuracies": 1.0, "rewards/chosen": 4.565001010894775, "rewards/margins": 1.8699214458465576, "rewards/rejected": 2.6950795650482178, "step": 6951 }, { "epoch": 1.54, "learning_rate": 1.3314914020769237e-06, "logits/chosen": -1.6455326080322266, "logits/rejected": -1.6455326080322266, "logps/chosen": -26.90949821472168, "logps/rejected": -26.90949821472168, "loss": 0.3507, "rewards/accuracies": 0.0, "rewards/chosen": 3.0839712619781494, "rewards/margins": 0.0, "rewards/rejected": 3.0839712619781494, "step": 6952 }, { "epoch": 1.54, "learning_rate": 1.3302738016005806e-06, "logits/chosen": -1.9758697748184204, "logits/rejected": -1.9879951477050781, "logps/chosen": -56.52899932861328, "logps/rejected": -55.16535949707031, "loss": 0.1671, "rewards/accuracies": 1.0, "rewards/chosen": 4.459959506988525, "rewards/margins": 1.015956163406372, "rewards/rejected": 3.4440033435821533, "step": 6953 }, { "epoch": 1.54, "learning_rate": 1.3290566726752346e-06, "logits/chosen": -1.8724712133407593, "logits/rejected": -1.8391201496124268, "logps/chosen": -25.757047653198242, "logps/rejected": -12.888171195983887, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 3.262763261795044, "rewards/margins": 0.6392326354980469, "rewards/rejected": 2.623530626296997, "step": 6954 }, { "epoch": 1.54, "learning_rate": 1.3278400154572807e-06, "logits/chosen": -2.1101601123809814, "logits/rejected": -1.9498571157455444, "logps/chosen": -137.0220947265625, "logps/rejected": -38.709144592285156, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 5.921521186828613, "rewards/margins": 4.4496612548828125, "rewards/rejected": 1.4718598127365112, "step": 6955 }, { "epoch": 1.54, "learning_rate": 1.3266238301030566e-06, "logits/chosen": -1.9190014600753784, "logits/rejected": -1.8724207878112793, "logps/chosen": -33.59218215942383, "logps/rejected": -26.104799270629883, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 3.4971530437469482, "rewards/margins": 1.7797490358352661, "rewards/rejected": 1.7174040079116821, "step": 6956 }, { "epoch": 1.54, "learning_rate": 1.3254081167688398e-06, "logits/chosen": -1.9959743022918701, "logits/rejected": -1.570865273475647, "logps/chosen": -28.507305145263672, "logps/rejected": -152.54379272460938, "loss": 1.2205, "rewards/accuracies": 0.0, "rewards/chosen": 4.243936538696289, "rewards/margins": -2.3412137031555176, "rewards/rejected": 6.585150241851807, "step": 6957 }, { "epoch": 1.54, "learning_rate": 1.3241928756108468e-06, "logits/chosen": -2.1438076496124268, "logits/rejected": -2.139054775238037, "logps/chosen": -102.02397155761719, "logps/rejected": -78.19367980957031, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 6.372531414031982, "rewards/margins": 3.1968905925750732, "rewards/rejected": 3.175640821456909, "step": 6958 }, { "epoch": 1.54, "learning_rate": 1.3229781067852343e-06, "logits/chosen": -2.0193333625793457, "logits/rejected": -2.057857036590576, "logps/chosen": -67.46029663085938, "logps/rejected": -110.12466430664062, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": 9.54389476776123, "rewards/margins": 1.859147548675537, "rewards/rejected": 7.684747219085693, "step": 6959 }, { "epoch": 1.54, "learning_rate": 1.321763810448093e-06, "logits/chosen": -1.8510724306106567, "logits/rejected": -1.8594586849212646, "logps/chosen": -30.782642364501953, "logps/rejected": -38.48577880859375, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 3.9972286224365234, "rewards/margins": 0.7446987628936768, "rewards/rejected": 3.2525298595428467, "step": 6960 }, { "epoch": 1.54, "learning_rate": 1.3205499867554627e-06, "logits/chosen": -2.1950857639312744, "logits/rejected": -2.1950857639312744, "logps/chosen": -31.60770606994629, "logps/rejected": -31.60770606994629, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.6548441648483276, "rewards/margins": 0.0, "rewards/rejected": 1.6548441648483276, "step": 6961 }, { "epoch": 1.54, "learning_rate": 1.3193366358633119e-06, "logits/chosen": -1.6757152080535889, "logits/rejected": -1.6757152080535889, "logps/chosen": -50.51563262939453, "logps/rejected": -50.51563262939453, "loss": 0.5298, "rewards/accuracies": 0.0, "rewards/chosen": 3.1668801307678223, "rewards/margins": 0.0, "rewards/rejected": 3.1668801307678223, "step": 6962 }, { "epoch": 1.54, "learning_rate": 1.3181237579275557e-06, "logits/chosen": -1.832069993019104, "logits/rejected": -1.8559203147888184, "logps/chosen": -28.46459197998047, "logps/rejected": -61.11421203613281, "loss": 0.6266, "rewards/accuracies": 1.0, "rewards/chosen": 3.3693559169769287, "rewards/margins": 1.611343502998352, "rewards/rejected": 1.7580124139785767, "step": 6963 }, { "epoch": 1.54, "learning_rate": 1.3169113531040462e-06, "logits/chosen": -2.0203986167907715, "logits/rejected": -1.9620580673217773, "logps/chosen": -31.605567932128906, "logps/rejected": -35.254676818847656, "loss": 0.5051, "rewards/accuracies": 1.0, "rewards/chosen": 2.970475912094116, "rewards/margins": 2.2619683742523193, "rewards/rejected": 0.7085075378417969, "step": 6964 }, { "epoch": 1.54, "learning_rate": 1.315699421548573e-06, "logits/chosen": -1.8333126306533813, "logits/rejected": -1.756890058517456, "logps/chosen": -48.070556640625, "logps/rejected": -48.470184326171875, "loss": 0.2499, "rewards/accuracies": 1.0, "rewards/chosen": 4.142858982086182, "rewards/margins": 0.8161606788635254, "rewards/rejected": 3.3266983032226562, "step": 6965 }, { "epoch": 1.54, "learning_rate": 1.3144879634168678e-06, "logits/chosen": -2.2109997272491455, "logits/rejected": -2.195087194442749, "logps/chosen": -16.418249130249023, "logps/rejected": -34.16325759887695, "loss": 0.5876, "rewards/accuracies": 1.0, "rewards/chosen": 3.322737216949463, "rewards/margins": 0.4792320728302002, "rewards/rejected": 2.8435051441192627, "step": 6966 }, { "epoch": 1.54, "learning_rate": 1.3132769788645995e-06, "logits/chosen": -2.0123555660247803, "logits/rejected": -2.0173399448394775, "logps/chosen": -52.3707275390625, "logps/rejected": -57.895408630371094, "loss": 0.8207, "rewards/accuracies": 0.0, "rewards/chosen": 5.355155944824219, "rewards/margins": -1.419161319732666, "rewards/rejected": 6.774317264556885, "step": 6967 }, { "epoch": 1.54, "learning_rate": 1.3120664680473782e-06, "logits/chosen": -2.048393726348877, "logits/rejected": -1.96135413646698, "logps/chosen": -106.99301147460938, "logps/rejected": -57.759063720703125, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 5.361944675445557, "rewards/margins": 3.813772678375244, "rewards/rejected": 1.5481719970703125, "step": 6968 }, { "epoch": 1.54, "learning_rate": 1.3108564311207471e-06, "logits/chosen": -1.8412741422653198, "logits/rejected": -1.841894507408142, "logps/chosen": -25.068254470825195, "logps/rejected": -51.48210906982422, "loss": 0.3266, "rewards/accuracies": 1.0, "rewards/chosen": 2.4682066440582275, "rewards/margins": 0.5604406595230103, "rewards/rejected": 1.9077659845352173, "step": 6969 }, { "epoch": 1.54, "learning_rate": 1.3096468682401998e-06, "logits/chosen": -1.9534416198730469, "logits/rejected": -2.002199172973633, "logps/chosen": -20.74896240234375, "logps/rejected": -99.27377319335938, "loss": 1.1687, "rewards/accuracies": 0.0, "rewards/chosen": 3.0616912841796875, "rewards/margins": -1.6903200149536133, "rewards/rejected": 4.752011299133301, "step": 6970 }, { "epoch": 1.54, "learning_rate": 1.3084377795611563e-06, "logits/chosen": -1.9435560703277588, "logits/rejected": -1.9514243602752686, "logps/chosen": -43.06947326660156, "logps/rejected": -85.05789184570312, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": 4.46105432510376, "rewards/margins": 0.7006690502166748, "rewards/rejected": 3.760385274887085, "step": 6971 }, { "epoch": 1.54, "learning_rate": 1.307229165238984e-06, "logits/chosen": -1.6808475255966187, "logits/rejected": -1.650225043296814, "logps/chosen": -174.00942993164062, "logps/rejected": -48.398616790771484, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 5.4372148513793945, "rewards/margins": 2.587968111038208, "rewards/rejected": 2.8492467403411865, "step": 6972 }, { "epoch": 1.54, "learning_rate": 1.3060210254289873e-06, "logits/chosen": -2.0854547023773193, "logits/rejected": -2.05119252204895, "logps/chosen": -57.683135986328125, "logps/rejected": -25.30721664428711, "loss": 0.3793, "rewards/accuracies": 0.0, "rewards/chosen": 2.8830933570861816, "rewards/margins": -0.11626172065734863, "rewards/rejected": 2.9993550777435303, "step": 6973 }, { "epoch": 1.54, "learning_rate": 1.3048133602864083e-06, "logits/chosen": -1.7559301853179932, "logits/rejected": -1.5609526634216309, "logps/chosen": -101.26290130615234, "logps/rejected": -30.088287353515625, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 3.123297929763794, "rewards/margins": 2.8977229595184326, "rewards/rejected": 0.22557488083839417, "step": 6974 }, { "epoch": 1.54, "learning_rate": 1.3036061699664294e-06, "logits/chosen": -1.9096031188964844, "logits/rejected": -1.9096031188964844, "logps/chosen": -5.8412322998046875, "logps/rejected": -5.8412322998046875, "loss": 1.6584, "rewards/accuracies": 0.0, "rewards/chosen": 2.1856982707977295, "rewards/margins": 0.0, "rewards/rejected": 2.1856982707977295, "step": 6975 }, { "epoch": 1.54, "learning_rate": 1.302399454624172e-06, "logits/chosen": -1.8259919881820679, "logits/rejected": -1.5043705701828003, "logps/chosen": -46.99005126953125, "logps/rejected": -54.30784225463867, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 3.274951219558716, "rewards/margins": 2.271033763885498, "rewards/rejected": 1.0039173364639282, "step": 6976 }, { "epoch": 1.54, "learning_rate": 1.301193214414696e-06, "logits/chosen": -2.278320074081421, "logits/rejected": -2.265042781829834, "logps/chosen": -46.44074630737305, "logps/rejected": -67.55215454101562, "loss": 0.6614, "rewards/accuracies": 1.0, "rewards/chosen": 3.6744625568389893, "rewards/margins": 1.9702380895614624, "rewards/rejected": 1.7042244672775269, "step": 6977 }, { "epoch": 1.54, "learning_rate": 1.2999874494930004e-06, "logits/chosen": -2.3808608055114746, "logits/rejected": -2.448340892791748, "logps/chosen": -109.18009185791016, "logps/rejected": -92.283935546875, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 8.024149894714355, "rewards/margins": 3.0210676193237305, "rewards/rejected": 5.003082275390625, "step": 6978 }, { "epoch": 1.54, "learning_rate": 1.2987821600140237e-06, "logits/chosen": -1.8057035207748413, "logits/rejected": -1.719007968902588, "logps/chosen": -127.32698822021484, "logps/rejected": -61.994659423828125, "loss": 0.2449, "rewards/accuracies": 1.0, "rewards/chosen": 5.632395267486572, "rewards/margins": 2.331623077392578, "rewards/rejected": 3.300772190093994, "step": 6979 }, { "epoch": 1.54, "learning_rate": 1.2975773461326414e-06, "logits/chosen": -1.904663324356079, "logits/rejected": -1.8618628978729248, "logps/chosen": -69.27318572998047, "logps/rejected": -109.37001037597656, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 7.541018009185791, "rewards/margins": 2.371628761291504, "rewards/rejected": 5.169389247894287, "step": 6980 }, { "epoch": 1.55, "learning_rate": 1.296373008003669e-06, "logits/chosen": -1.6669554710388184, "logits/rejected": -1.6840968132019043, "logps/chosen": -42.20935821533203, "logps/rejected": -24.762622833251953, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 2.0340538024902344, "rewards/margins": 0.5819858312606812, "rewards/rejected": 1.4520679712295532, "step": 6981 }, { "epoch": 1.55, "learning_rate": 1.2951691457818621e-06, "logits/chosen": -2.014383316040039, "logits/rejected": -1.9796267747879028, "logps/chosen": -48.821189880371094, "logps/rejected": -34.83863067626953, "loss": 0.3577, "rewards/accuracies": 1.0, "rewards/chosen": 4.9507904052734375, "rewards/margins": 0.5862236022949219, "rewards/rejected": 4.364566802978516, "step": 6982 }, { "epoch": 1.55, "learning_rate": 1.293965759621914e-06, "logits/chosen": -1.8683139085769653, "logits/rejected": -1.8324178457260132, "logps/chosen": -59.408111572265625, "logps/rejected": -33.60467529296875, "loss": 0.651, "rewards/accuracies": 0.0, "rewards/chosen": 2.9190666675567627, "rewards/margins": -0.8277771472930908, "rewards/rejected": 3.7468438148498535, "step": 6983 }, { "epoch": 1.55, "learning_rate": 1.2927628496784566e-06, "logits/chosen": -2.2294232845306396, "logits/rejected": -2.1849915981292725, "logps/chosen": -78.63915252685547, "logps/rejected": -25.01187515258789, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": 4.073749542236328, "rewards/margins": 0.9368312358856201, "rewards/rejected": 3.136918306350708, "step": 6984 }, { "epoch": 1.55, "learning_rate": 1.2915604161060608e-06, "logits/chosen": -1.8049213886260986, "logits/rejected": -1.7819536924362183, "logps/chosen": -34.83595275878906, "logps/rejected": -51.06315612792969, "loss": 0.9245, "rewards/accuracies": 0.0, "rewards/chosen": 3.2377305030822754, "rewards/margins": -1.5158491134643555, "rewards/rejected": 4.753579616546631, "step": 6985 }, { "epoch": 1.55, "learning_rate": 1.2903584590592366e-06, "logits/chosen": -1.983941674232483, "logits/rejected": -1.9612864255905151, "logps/chosen": -86.22016906738281, "logps/rejected": -116.31873321533203, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": 10.654396057128906, "rewards/margins": 3.369326114654541, "rewards/rejected": 7.285069942474365, "step": 6986 }, { "epoch": 1.55, "learning_rate": 1.2891569786924334e-06, "logits/chosen": -2.0293445587158203, "logits/rejected": -2.0128257274627686, "logps/chosen": -44.50165557861328, "logps/rejected": -28.721511840820312, "loss": 0.273, "rewards/accuracies": 1.0, "rewards/chosen": 3.6278419494628906, "rewards/margins": 0.4614126682281494, "rewards/rejected": 3.166429281234741, "step": 6987 }, { "epoch": 1.55, "learning_rate": 1.2879559751600385e-06, "logits/chosen": -2.2066352367401123, "logits/rejected": -2.1903772354125977, "logps/chosen": -83.5693359375, "logps/rejected": -35.335933685302734, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 3.8621323108673096, "rewards/margins": 2.541834831237793, "rewards/rejected": 1.3202975988388062, "step": 6988 }, { "epoch": 1.55, "learning_rate": 1.2867554486163764e-06, "logits/chosen": -1.879425287246704, "logits/rejected": -1.9399573802947998, "logps/chosen": -120.32589721679688, "logps/rejected": -108.7261962890625, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 7.529167175292969, "rewards/margins": 1.7675094604492188, "rewards/rejected": 5.76165771484375, "step": 6989 }, { "epoch": 1.55, "learning_rate": 1.2855553992157127e-06, "logits/chosen": -1.9594758749008179, "logits/rejected": -1.8692636489868164, "logps/chosen": -54.14549255371094, "logps/rejected": -15.018647193908691, "loss": 0.3421, "rewards/accuracies": 1.0, "rewards/chosen": 2.639941453933716, "rewards/margins": 1.426850438117981, "rewards/rejected": 1.2130910158157349, "step": 6990 }, { "epoch": 1.55, "learning_rate": 1.2843558271122502e-06, "logits/chosen": -1.7966983318328857, "logits/rejected": -1.804645299911499, "logps/chosen": -34.858360290527344, "logps/rejected": -59.43339538574219, "loss": 0.6653, "rewards/accuracies": 0.0, "rewards/chosen": 2.881012439727783, "rewards/margins": -1.0111796855926514, "rewards/rejected": 3.8921921253204346, "step": 6991 }, { "epoch": 1.55, "learning_rate": 1.2831567324601325e-06, "logits/chosen": -2.2257936000823975, "logits/rejected": -2.185164451599121, "logps/chosen": -47.47383499145508, "logps/rejected": -74.11654663085938, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 4.89768648147583, "rewards/margins": 2.6358234882354736, "rewards/rejected": 2.2618629932403564, "step": 6992 }, { "epoch": 1.55, "learning_rate": 1.2819581154134397e-06, "logits/chosen": -1.7551897764205933, "logits/rejected": -1.7340596914291382, "logps/chosen": -70.19727325439453, "logps/rejected": -47.23578643798828, "loss": 0.3772, "rewards/accuracies": 1.0, "rewards/chosen": 4.605095863342285, "rewards/margins": 1.1573784351348877, "rewards/rejected": 3.4477174282073975, "step": 6993 }, { "epoch": 1.55, "learning_rate": 1.2807599761261907e-06, "logits/chosen": -1.897471308708191, "logits/rejected": -1.8652187585830688, "logps/chosen": -49.57011413574219, "logps/rejected": -32.492801666259766, "loss": 0.3658, "rewards/accuracies": 1.0, "rewards/chosen": 3.0796356201171875, "rewards/margins": 0.07577848434448242, "rewards/rejected": 3.003857135772705, "step": 6994 }, { "epoch": 1.55, "learning_rate": 1.2795623147523439e-06, "logits/chosen": -1.8991377353668213, "logits/rejected": -1.873926043510437, "logps/chosen": -45.660736083984375, "logps/rejected": -32.96656799316406, "loss": 0.3391, "rewards/accuracies": 1.0, "rewards/chosen": 3.2869980335235596, "rewards/margins": 0.9480500221252441, "rewards/rejected": 2.3389480113983154, "step": 6995 }, { "epoch": 1.55, "learning_rate": 1.2783651314457961e-06, "logits/chosen": -1.9358546733856201, "logits/rejected": -1.9379398822784424, "logps/chosen": -56.99258804321289, "logps/rejected": -59.75965881347656, "loss": 0.378, "rewards/accuracies": 0.0, "rewards/chosen": 4.889864921569824, "rewards/margins": -0.07314491271972656, "rewards/rejected": 4.963009834289551, "step": 6996 }, { "epoch": 1.55, "learning_rate": 1.277168426360383e-06, "logits/chosen": -1.6532011032104492, "logits/rejected": -1.6497918367385864, "logps/chosen": -45.09519958496094, "logps/rejected": -53.990535736083984, "loss": 0.7641, "rewards/accuracies": 0.0, "rewards/chosen": 2.579425096511841, "rewards/margins": -1.1821849346160889, "rewards/rejected": 3.7616100311279297, "step": 6997 }, { "epoch": 1.55, "learning_rate": 1.2759721996498786e-06, "logits/chosen": -1.925305724143982, "logits/rejected": -1.9105849266052246, "logps/chosen": -57.97758865356445, "logps/rejected": -29.337167739868164, "loss": 0.1474, "rewards/accuracies": 1.0, "rewards/chosen": 3.7692134380340576, "rewards/margins": 1.143479585647583, "rewards/rejected": 2.6257338523864746, "step": 6998 }, { "epoch": 1.55, "learning_rate": 1.2747764514679928e-06, "logits/chosen": -1.916685938835144, "logits/rejected": -1.916685938835144, "logps/chosen": -21.33407211303711, "logps/rejected": -21.33407211303711, "loss": 0.3749, "rewards/accuracies": 0.0, "rewards/chosen": 2.8984062671661377, "rewards/margins": 0.0, "rewards/rejected": 2.8984062671661377, "step": 6999 }, { "epoch": 1.55, "learning_rate": 1.2735811819683774e-06, "logits/chosen": -2.00115704536438, "logits/rejected": -2.00115704536438, "logps/chosen": -73.80961608886719, "logps/rejected": -73.80961608886719, "loss": 0.462, "rewards/accuracies": 0.0, "rewards/chosen": 4.825337886810303, "rewards/margins": 0.0, "rewards/rejected": 4.825337886810303, "step": 7000 }, { "epoch": 1.55, "learning_rate": 1.2723863913046224e-06, "logits/chosen": -2.2605602741241455, "logits/rejected": -2.2606089115142822, "logps/chosen": -63.229461669921875, "logps/rejected": -58.300506591796875, "loss": 0.5343, "rewards/accuracies": 1.0, "rewards/chosen": 5.439937114715576, "rewards/margins": 1.0732269287109375, "rewards/rejected": 4.366710186004639, "step": 7001 }, { "epoch": 1.55, "learning_rate": 1.2711920796302552e-06, "logits/chosen": -2.0133285522460938, "logits/rejected": -1.866540789604187, "logps/chosen": -45.51492691040039, "logps/rejected": -36.61122131347656, "loss": 0.4402, "rewards/accuracies": 0.0, "rewards/chosen": 3.8729054927825928, "rewards/margins": -0.30501580238342285, "rewards/rejected": 4.177921295166016, "step": 7002 }, { "epoch": 1.55, "learning_rate": 1.269998247098742e-06, "logits/chosen": -1.9783494472503662, "logits/rejected": -1.8861817121505737, "logps/chosen": -107.90548706054688, "logps/rejected": -70.79714965820312, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 6.133827209472656, "rewards/margins": 2.972796678543091, "rewards/rejected": 3.1610305309295654, "step": 7003 }, { "epoch": 1.55, "learning_rate": 1.2688048938634873e-06, "logits/chosen": -2.0895209312438965, "logits/rejected": -2.0577149391174316, "logps/chosen": -57.264732360839844, "logps/rejected": -57.62187957763672, "loss": 0.199, "rewards/accuracies": 1.0, "rewards/chosen": 5.103882789611816, "rewards/margins": 1.0185165405273438, "rewards/rejected": 4.085366249084473, "step": 7004 }, { "epoch": 1.55, "learning_rate": 1.2676120200778342e-06, "logits/chosen": -2.213285446166992, "logits/rejected": -2.17024302482605, "logps/chosen": -121.57508087158203, "logps/rejected": -119.10652923583984, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 10.790645599365234, "rewards/margins": 2.28469181060791, "rewards/rejected": 8.505953788757324, "step": 7005 }, { "epoch": 1.55, "learning_rate": 1.266419625895064e-06, "logits/chosen": -1.6853257417678833, "logits/rejected": -1.6314228773117065, "logps/chosen": -33.50600814819336, "logps/rejected": -31.572948455810547, "loss": 0.4744, "rewards/accuracies": 1.0, "rewards/chosen": 3.6083226203918457, "rewards/margins": 1.233577013015747, "rewards/rejected": 2.3747456073760986, "step": 7006 }, { "epoch": 1.55, "learning_rate": 1.2652277114683986e-06, "logits/chosen": -1.9960730075836182, "logits/rejected": -2.031360626220703, "logps/chosen": -38.203086853027344, "logps/rejected": -96.37500762939453, "loss": 0.5277, "rewards/accuracies": 0.0, "rewards/chosen": 5.570497989654541, "rewards/margins": -0.1376481056213379, "rewards/rejected": 5.708146095275879, "step": 7007 }, { "epoch": 1.55, "learning_rate": 1.2640362769509905e-06, "logits/chosen": -1.8292444944381714, "logits/rejected": -1.7833653688430786, "logps/chosen": -59.25523376464844, "logps/rejected": -36.22300338745117, "loss": 0.1563, "rewards/accuracies": 1.0, "rewards/chosen": 5.1516265869140625, "rewards/margins": 1.0981707572937012, "rewards/rejected": 4.053455829620361, "step": 7008 }, { "epoch": 1.55, "learning_rate": 1.262845322495943e-06, "logits/chosen": -1.7348593473434448, "logits/rejected": -1.7348593473434448, "logps/chosen": -39.22613525390625, "logps/rejected": -39.22613525390625, "loss": 0.3554, "rewards/accuracies": 0.0, "rewards/chosen": 6.949427127838135, "rewards/margins": 0.0, "rewards/rejected": 6.949427127838135, "step": 7009 }, { "epoch": 1.55, "learning_rate": 1.2616548482562862e-06, "logits/chosen": -2.003896713256836, "logits/rejected": -2.0256617069244385, "logps/chosen": -115.32903289794922, "logps/rejected": -59.00366973876953, "loss": 0.0707, "rewards/accuracies": 1.0, "rewards/chosen": 7.722809791564941, "rewards/margins": 2.2893142700195312, "rewards/rejected": 5.43349552154541, "step": 7010 }, { "epoch": 1.55, "learning_rate": 1.2604648543849951e-06, "logits/chosen": -1.5822819471359253, "logits/rejected": -1.4521536827087402, "logps/chosen": -36.14978790283203, "logps/rejected": -10.681031227111816, "loss": 0.0939, "rewards/accuracies": 1.0, "rewards/chosen": 3.3048150539398193, "rewards/margins": 1.8785667419433594, "rewards/rejected": 1.42624831199646, "step": 7011 }, { "epoch": 1.55, "learning_rate": 1.25927534103498e-06, "logits/chosen": -1.9461687803268433, "logits/rejected": -1.897612452507019, "logps/chosen": -124.12963104248047, "logps/rejected": -73.48158264160156, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 8.131412506103516, "rewards/margins": 4.536879539489746, "rewards/rejected": 3.5945327281951904, "step": 7012 }, { "epoch": 1.55, "learning_rate": 1.2580863083590911e-06, "logits/chosen": -1.870773196220398, "logits/rejected": -1.9094195365905762, "logps/chosen": -42.79133987426758, "logps/rejected": -111.81678009033203, "loss": 1.3757, "rewards/accuracies": 0.0, "rewards/chosen": 3.430990219116211, "rewards/margins": -2.655435085296631, "rewards/rejected": 6.086425304412842, "step": 7013 }, { "epoch": 1.55, "learning_rate": 1.256897756510118e-06, "logits/chosen": -1.723479986190796, "logits/rejected": -1.656455159187317, "logps/chosen": -30.134918212890625, "logps/rejected": -47.544002532958984, "loss": 0.6644, "rewards/accuracies": 1.0, "rewards/chosen": 2.501152753829956, "rewards/margins": 0.07839846611022949, "rewards/rejected": 2.4227542877197266, "step": 7014 }, { "epoch": 1.55, "learning_rate": 1.2557096856407825e-06, "logits/chosen": -1.979762077331543, "logits/rejected": -1.948248028755188, "logps/chosen": -46.941864013671875, "logps/rejected": -15.527538299560547, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 3.152806043624878, "rewards/margins": 2.759493589401245, "rewards/rejected": 0.3933124542236328, "step": 7015 }, { "epoch": 1.55, "learning_rate": 1.2545220959037535e-06, "logits/chosen": -1.7894197702407837, "logits/rejected": -1.7397496700286865, "logps/chosen": -73.5352554321289, "logps/rejected": -81.8360366821289, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": 8.1353178024292, "rewards/margins": 3.9216976165771484, "rewards/rejected": 4.213620185852051, "step": 7016 }, { "epoch": 1.55, "learning_rate": 1.2533349874516288e-06, "logits/chosen": -1.8986942768096924, "logits/rejected": -1.7860187292099, "logps/chosen": -70.97239685058594, "logps/rejected": -49.908851623535156, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 4.317448616027832, "rewards/margins": 1.4880869388580322, "rewards/rejected": 2.8293616771698, "step": 7017 }, { "epoch": 1.55, "learning_rate": 1.2521483604369544e-06, "logits/chosen": -1.987608551979065, "logits/rejected": -1.9580109119415283, "logps/chosen": -47.9532356262207, "logps/rejected": -59.56175994873047, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": 3.2619640827178955, "rewards/margins": 1.2888050079345703, "rewards/rejected": 1.9731590747833252, "step": 7018 }, { "epoch": 1.55, "learning_rate": 1.2509622150122047e-06, "logits/chosen": -1.8466989994049072, "logits/rejected": -1.8466989994049072, "logps/chosen": -29.333171844482422, "logps/rejected": -29.333171844482422, "loss": 0.3851, "rewards/accuracies": 0.0, "rewards/chosen": 2.6709325313568115, "rewards/margins": 0.0, "rewards/rejected": 2.6709325313568115, "step": 7019 }, { "epoch": 1.55, "learning_rate": 1.2497765513297976e-06, "logits/chosen": -1.9479382038116455, "logits/rejected": -1.817251205444336, "logps/chosen": -87.97654724121094, "logps/rejected": -60.969913482666016, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 5.9833574295043945, "rewards/margins": 1.997222661972046, "rewards/rejected": 3.9861347675323486, "step": 7020 }, { "epoch": 1.55, "learning_rate": 1.2485913695420887e-06, "logits/chosen": -1.9874995946884155, "logits/rejected": -1.96002197265625, "logps/chosen": -38.25809860229492, "logps/rejected": -55.4994010925293, "loss": 0.1682, "rewards/accuracies": 1.0, "rewards/chosen": 3.348755359649658, "rewards/margins": 0.9476876258850098, "rewards/rejected": 2.4010677337646484, "step": 7021 }, { "epoch": 1.55, "learning_rate": 1.2474066698013703e-06, "logits/chosen": -1.6451473236083984, "logits/rejected": -1.5956777334213257, "logps/chosen": -67.92901611328125, "logps/rejected": -37.07795333862305, "loss": 0.2589, "rewards/accuracies": 1.0, "rewards/chosen": 2.3246209621429443, "rewards/margins": 0.5311635732650757, "rewards/rejected": 1.7934573888778687, "step": 7022 }, { "epoch": 1.55, "learning_rate": 1.246222452259876e-06, "logits/chosen": -2.0074918270111084, "logits/rejected": -2.0087082386016846, "logps/chosen": -100.06236267089844, "logps/rejected": -185.19891357421875, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 13.296562194824219, "rewards/margins": 3.103398323059082, "rewards/rejected": 10.193163871765137, "step": 7023 }, { "epoch": 1.55, "learning_rate": 1.2450387170697691e-06, "logits/chosen": -2.1197848320007324, "logits/rejected": -2.0956339836120605, "logps/chosen": -42.239410400390625, "logps/rejected": -36.31903076171875, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 3.9487550258636475, "rewards/margins": 1.6406269073486328, "rewards/rejected": 2.3081281185150146, "step": 7024 }, { "epoch": 1.55, "learning_rate": 1.2438554643831635e-06, "logits/chosen": -1.9310343265533447, "logits/rejected": -1.9350250959396362, "logps/chosen": -43.263343811035156, "logps/rejected": -69.7409896850586, "loss": 0.3399, "rewards/accuracies": 1.0, "rewards/chosen": 4.611702919006348, "rewards/margins": 1.7133057117462158, "rewards/rejected": 2.898397207260132, "step": 7025 }, { "epoch": 1.56, "learning_rate": 1.2426726943520983e-06, "logits/chosen": -1.9733694791793823, "logits/rejected": -1.9494603872299194, "logps/chosen": -62.05014419555664, "logps/rejected": -51.08085632324219, "loss": 0.9264, "rewards/accuracies": 1.0, "rewards/chosen": 3.716911792755127, "rewards/margins": 0.6019749641418457, "rewards/rejected": 3.1149368286132812, "step": 7026 }, { "epoch": 1.56, "learning_rate": 1.2414904071285628e-06, "logits/chosen": -2.176210403442383, "logits/rejected": -2.176210403442383, "logps/chosen": -49.54493713378906, "logps/rejected": -49.54493713378906, "loss": 0.3558, "rewards/accuracies": 0.0, "rewards/chosen": 4.927392482757568, "rewards/margins": 0.0, "rewards/rejected": 4.927392482757568, "step": 7027 }, { "epoch": 1.56, "learning_rate": 1.2403086028644728e-06, "logits/chosen": -1.8016775846481323, "logits/rejected": -1.7896099090576172, "logps/chosen": -35.04126739501953, "logps/rejected": -52.03256607055664, "loss": 0.327, "rewards/accuracies": 1.0, "rewards/chosen": 3.439910888671875, "rewards/margins": 1.129626750946045, "rewards/rejected": 2.31028413772583, "step": 7028 }, { "epoch": 1.56, "learning_rate": 1.2391272817116895e-06, "logits/chosen": -1.760033130645752, "logits/rejected": -1.7647908926010132, "logps/chosen": -33.14382553100586, "logps/rejected": -38.27433776855469, "loss": 0.2423, "rewards/accuracies": 1.0, "rewards/chosen": 2.8739736080169678, "rewards/margins": 0.5514371395111084, "rewards/rejected": 2.3225364685058594, "step": 7029 }, { "epoch": 1.56, "learning_rate": 1.2379464438220095e-06, "logits/chosen": -1.9135985374450684, "logits/rejected": -1.8715633153915405, "logps/chosen": -30.39228630065918, "logps/rejected": -39.999820709228516, "loss": 0.1646, "rewards/accuracies": 1.0, "rewards/chosen": 2.9630048274993896, "rewards/margins": 1.0724228620529175, "rewards/rejected": 1.8905819654464722, "step": 7030 }, { "epoch": 1.56, "learning_rate": 1.2367660893471684e-06, "logits/chosen": -1.7011396884918213, "logits/rejected": -1.6316475868225098, "logps/chosen": -35.82378005981445, "logps/rejected": -23.014904022216797, "loss": 1.045, "rewards/accuracies": 1.0, "rewards/chosen": 1.2994972467422485, "rewards/margins": 0.6462753415107727, "rewards/rejected": 0.6532219052314758, "step": 7031 }, { "epoch": 1.56, "learning_rate": 1.235586218438839e-06, "logits/chosen": -1.9497517347335815, "logits/rejected": -1.9633498191833496, "logps/chosen": -40.03438949584961, "logps/rejected": -111.0445556640625, "loss": 0.2066, "rewards/accuracies": 1.0, "rewards/chosen": 7.812158107757568, "rewards/margins": 1.0585737228393555, "rewards/rejected": 6.753584384918213, "step": 7032 }, { "epoch": 1.56, "learning_rate": 1.2344068312486284e-06, "logits/chosen": -1.9680449962615967, "logits/rejected": -1.9855369329452515, "logps/chosen": -35.26781463623047, "logps/rejected": -38.17448043823242, "loss": 0.4042, "rewards/accuracies": 0.0, "rewards/chosen": 3.170966386795044, "rewards/margins": -0.20197343826293945, "rewards/rejected": 3.3729398250579834, "step": 7033 }, { "epoch": 1.56, "learning_rate": 1.2332279279280907e-06, "logits/chosen": -1.8438663482666016, "logits/rejected": -1.6122758388519287, "logps/chosen": -130.70152282714844, "logps/rejected": -66.02490234375, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": 7.398988246917725, "rewards/margins": 1.859997272491455, "rewards/rejected": 5.5389909744262695, "step": 7034 }, { "epoch": 1.56, "learning_rate": 1.232049508628706e-06, "logits/chosen": -1.7546969652175903, "logits/rejected": -1.7545944452285767, "logps/chosen": -27.986736297607422, "logps/rejected": -52.642921447753906, "loss": 0.3274, "rewards/accuracies": 1.0, "rewards/chosen": 2.962841510772705, "rewards/margins": 0.22303056716918945, "rewards/rejected": 2.7398109436035156, "step": 7035 }, { "epoch": 1.56, "learning_rate": 1.230871573501905e-06, "logits/chosen": -1.8504467010498047, "logits/rejected": -1.8458967208862305, "logps/chosen": -25.176280975341797, "logps/rejected": -55.18938446044922, "loss": 0.3081, "rewards/accuracies": 1.0, "rewards/chosen": 3.345487356185913, "rewards/margins": 0.17297601699829102, "rewards/rejected": 3.172511339187622, "step": 7036 }, { "epoch": 1.56, "learning_rate": 1.229694122699044e-06, "logits/chosen": -1.857151985168457, "logits/rejected": -1.6144495010375977, "logps/chosen": -97.80717468261719, "logps/rejected": -19.644088745117188, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 7.749955654144287, "rewards/margins": 7.511714935302734, "rewards/rejected": 0.2382408231496811, "step": 7037 }, { "epoch": 1.56, "learning_rate": 1.228517156371425e-06, "logits/chosen": -1.6196447610855103, "logits/rejected": -1.5088849067687988, "logps/chosen": -26.071136474609375, "logps/rejected": -10.23598861694336, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 3.030371904373169, "rewards/margins": 2.421931028366089, "rewards/rejected": 0.6084408164024353, "step": 7038 }, { "epoch": 1.56, "learning_rate": 1.2273406746702847e-06, "logits/chosen": -2.1657166481018066, "logits/rejected": -2.0795211791992188, "logps/chosen": -141.52133178710938, "logps/rejected": -44.869842529296875, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": 6.036306858062744, "rewards/margins": 2.1882264614105225, "rewards/rejected": 3.8480803966522217, "step": 7039 }, { "epoch": 1.56, "learning_rate": 1.226164677746799e-06, "logits/chosen": -2.0152997970581055, "logits/rejected": -1.9900624752044678, "logps/chosen": -34.58274459838867, "logps/rejected": -69.25521850585938, "loss": 0.2576, "rewards/accuracies": 1.0, "rewards/chosen": 4.433684349060059, "rewards/margins": 1.2608022689819336, "rewards/rejected": 3.172882080078125, "step": 7040 }, { "epoch": 1.56, "learning_rate": 1.2249891657520813e-06, "logits/chosen": -2.1237804889678955, "logits/rejected": -2.103219747543335, "logps/chosen": -87.35078430175781, "logps/rejected": -64.18299102783203, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 4.818934917449951, "rewards/margins": 3.0337166786193848, "rewards/rejected": 1.7852181196212769, "step": 7041 }, { "epoch": 1.56, "learning_rate": 1.2238141388371782e-06, "logits/chosen": -1.7101588249206543, "logits/rejected": -1.696288824081421, "logps/chosen": -64.61790466308594, "logps/rejected": -58.12274169921875, "loss": 0.1207, "rewards/accuracies": 1.0, "rewards/chosen": 3.594381809234619, "rewards/margins": 1.9335602521896362, "rewards/rejected": 1.660821557044983, "step": 7042 }, { "epoch": 1.56, "learning_rate": 1.2226395971530835e-06, "logits/chosen": -1.9773863554000854, "logits/rejected": -1.8542271852493286, "logps/chosen": -114.93862915039062, "logps/rejected": -41.767879486083984, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 6.746194362640381, "rewards/margins": 4.594438552856445, "rewards/rejected": 2.1517560482025146, "step": 7043 }, { "epoch": 1.56, "learning_rate": 1.221465540850717e-06, "logits/chosen": -1.6675231456756592, "logits/rejected": -1.6422544717788696, "logps/chosen": -42.81346130371094, "logps/rejected": -33.32427215576172, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": 4.1551408767700195, "rewards/margins": 1.8306963443756104, "rewards/rejected": 2.324444532394409, "step": 7044 }, { "epoch": 1.56, "learning_rate": 1.2202919700809478e-06, "logits/chosen": -1.9275988340377808, "logits/rejected": -1.9275988340377808, "logps/chosen": -63.937400817871094, "logps/rejected": -63.937400817871094, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 3.069049835205078, "rewards/margins": 0.0, "rewards/rejected": 3.069049835205078, "step": 7045 }, { "epoch": 1.56, "learning_rate": 1.2191188849945734e-06, "logits/chosen": -2.1019294261932373, "logits/rejected": -2.079335927963257, "logps/chosen": -53.27693176269531, "logps/rejected": -84.4647445678711, "loss": 0.4221, "rewards/accuracies": 1.0, "rewards/chosen": 4.3635454177856445, "rewards/margins": 1.814054250717163, "rewards/rejected": 2.5494911670684814, "step": 7046 }, { "epoch": 1.56, "learning_rate": 1.217946285742333e-06, "logits/chosen": -1.9148846864700317, "logits/rejected": -1.9627244472503662, "logps/chosen": -48.63200378417969, "logps/rejected": -55.667938232421875, "loss": 2.1559, "rewards/accuracies": 0.0, "rewards/chosen": 4.120560646057129, "rewards/margins": -3.9969120025634766, "rewards/rejected": 8.117472648620605, "step": 7047 }, { "epoch": 1.56, "learning_rate": 1.2167741724749026e-06, "logits/chosen": -1.9478391408920288, "logits/rejected": -1.8818657398223877, "logps/chosen": -81.60379028320312, "logps/rejected": -17.2232608795166, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 3.364840030670166, "rewards/margins": 1.4422392845153809, "rewards/rejected": 1.9226007461547852, "step": 7048 }, { "epoch": 1.56, "learning_rate": 1.2156025453428972e-06, "logits/chosen": -1.9744116067886353, "logits/rejected": -2.0031898021698, "logps/chosen": -34.86967086791992, "logps/rejected": -49.22296142578125, "loss": 0.3102, "rewards/accuracies": 1.0, "rewards/chosen": 3.4566638469696045, "rewards/margins": 0.2067873477935791, "rewards/rejected": 3.2498764991760254, "step": 7049 }, { "epoch": 1.56, "learning_rate": 1.214431404496868e-06, "logits/chosen": -1.8696666955947876, "logits/rejected": -1.8062416315078735, "logps/chosen": -67.55278778076172, "logps/rejected": -70.2009506225586, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 4.001687049865723, "rewards/margins": 2.60060977935791, "rewards/rejected": 1.4010772705078125, "step": 7050 }, { "epoch": 1.56, "learning_rate": 1.2132607500873005e-06, "logits/chosen": -1.7120429277420044, "logits/rejected": -1.7066802978515625, "logps/chosen": -61.90778350830078, "logps/rejected": -71.85438537597656, "loss": 0.24, "rewards/accuracies": 1.0, "rewards/chosen": 3.6907958984375, "rewards/margins": 0.8673491477966309, "rewards/rejected": 2.823446750640869, "step": 7051 }, { "epoch": 1.56, "learning_rate": 1.2120905822646268e-06, "logits/chosen": -1.99616277217865, "logits/rejected": -1.9758833646774292, "logps/chosen": -115.77850341796875, "logps/rejected": -67.02336883544922, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 5.200904846191406, "rewards/margins": 3.1459364891052246, "rewards/rejected": 2.0549683570861816, "step": 7052 }, { "epoch": 1.56, "learning_rate": 1.2109209011792038e-06, "logits/chosen": -1.6358857154846191, "logits/rejected": -1.5619438886642456, "logps/chosen": -60.77276611328125, "logps/rejected": -58.47622299194336, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 7.415256023406982, "rewards/margins": 6.492733001708984, "rewards/rejected": 0.9225231409072876, "step": 7053 }, { "epoch": 1.56, "learning_rate": 1.2097517069813402e-06, "logits/chosen": -1.8921171426773071, "logits/rejected": -1.850080132484436, "logps/chosen": -47.93419647216797, "logps/rejected": -42.35615539550781, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 2.682077169418335, "rewards/margins": 0.1390221118927002, "rewards/rejected": 2.5430550575256348, "step": 7054 }, { "epoch": 1.56, "learning_rate": 1.2085829998212683e-06, "logits/chosen": -1.956242322921753, "logits/rejected": -1.9370063543319702, "logps/chosen": -39.43293762207031, "logps/rejected": -62.165679931640625, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": 4.014549255371094, "rewards/margins": 1.9384727478027344, "rewards/rejected": 2.0760765075683594, "step": 7055 }, { "epoch": 1.56, "learning_rate": 1.207414779849167e-06, "logits/chosen": -1.9866372346878052, "logits/rejected": -1.9374557733535767, "logps/chosen": -50.21464538574219, "logps/rejected": -53.22967529296875, "loss": 0.1184, "rewards/accuracies": 1.0, "rewards/chosen": 4.0190958976745605, "rewards/margins": 1.8381991386413574, "rewards/rejected": 2.180896759033203, "step": 7056 }, { "epoch": 1.56, "learning_rate": 1.2062470472151494e-06, "logits/chosen": -2.137022018432617, "logits/rejected": -2.1976234912872314, "logps/chosen": -35.27366638183594, "logps/rejected": -79.55607604980469, "loss": 1.8069, "rewards/accuracies": 0.0, "rewards/chosen": 4.959569454193115, "rewards/margins": -3.5505166053771973, "rewards/rejected": 8.510086059570312, "step": 7057 }, { "epoch": 1.56, "learning_rate": 1.2050798020692667e-06, "logits/chosen": -1.875573754310608, "logits/rejected": -1.683428168296814, "logps/chosen": -92.46371459960938, "logps/rejected": -210.67596435546875, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 9.144490242004395, "rewards/margins": 2.8212294578552246, "rewards/rejected": 6.32326078414917, "step": 7058 }, { "epoch": 1.56, "learning_rate": 1.203913044561506e-06, "logits/chosen": -1.5988624095916748, "logits/rejected": -1.7023392915725708, "logps/chosen": -6.624128818511963, "logps/rejected": -56.13655471801758, "loss": 3.2335, "rewards/accuracies": 0.0, "rewards/chosen": 0.5888891816139221, "rewards/margins": -5.77072286605835, "rewards/rejected": 6.359611988067627, "step": 7059 }, { "epoch": 1.56, "learning_rate": 1.2027467748417936e-06, "logits/chosen": -2.100435733795166, "logits/rejected": -2.1017842292785645, "logps/chosen": -54.902503967285156, "logps/rejected": -62.366615295410156, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": 2.979799747467041, "rewards/margins": 0.592864990234375, "rewards/rejected": 2.386934757232666, "step": 7060 }, { "epoch": 1.56, "learning_rate": 1.2015809930599941e-06, "logits/chosen": -1.9635900259017944, "logits/rejected": -1.886813759803772, "logps/chosen": -108.77130889892578, "logps/rejected": -54.74046325683594, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 6.720321178436279, "rewards/margins": 2.80831241607666, "rewards/rejected": 3.912008762359619, "step": 7061 }, { "epoch": 1.56, "learning_rate": 1.2004156993659028e-06, "logits/chosen": -1.9701820611953735, "logits/rejected": -1.970170021057129, "logps/chosen": -38.00004196166992, "logps/rejected": -102.41596984863281, "loss": 0.4479, "rewards/accuracies": 0.0, "rewards/chosen": 3.845895767211914, "rewards/margins": -0.3402552604675293, "rewards/rejected": 4.186151027679443, "step": 7062 }, { "epoch": 1.56, "learning_rate": 1.1992508939092634e-06, "logits/chosen": -1.8306009769439697, "logits/rejected": -1.7954657077789307, "logps/chosen": -204.30325317382812, "logps/rejected": -72.95991516113281, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 9.420419692993164, "rewards/margins": 3.952108860015869, "rewards/rejected": 5.468310832977295, "step": 7063 }, { "epoch": 1.56, "learning_rate": 1.1980865768397453e-06, "logits/chosen": -1.9210152626037598, "logits/rejected": -1.9348005056381226, "logps/chosen": -47.81629943847656, "logps/rejected": -90.29110717773438, "loss": 2.0417, "rewards/accuracies": 0.0, "rewards/chosen": 5.370578289031982, "rewards/margins": -1.6201324462890625, "rewards/rejected": 6.990710735321045, "step": 7064 }, { "epoch": 1.56, "learning_rate": 1.196922748306963e-06, "logits/chosen": -1.7442243099212646, "logits/rejected": -1.6960062980651855, "logps/chosen": -25.49704360961914, "logps/rejected": -61.23432922363281, "loss": 0.2509, "rewards/accuracies": 1.0, "rewards/chosen": 4.279465198516846, "rewards/margins": 0.446491003036499, "rewards/rejected": 3.8329741954803467, "step": 7065 }, { "epoch": 1.56, "learning_rate": 1.195759408460465e-06, "logits/chosen": -1.8860905170440674, "logits/rejected": -1.816532015800476, "logps/chosen": -44.96865463256836, "logps/rejected": -15.678008079528809, "loss": 0.3422, "rewards/accuracies": 1.0, "rewards/chosen": 1.6610809564590454, "rewards/margins": 0.5210515260696411, "rewards/rejected": 1.1400294303894043, "step": 7066 }, { "epoch": 1.56, "learning_rate": 1.1945965574497381e-06, "logits/chosen": -1.8753652572631836, "logits/rejected": -1.8152598142623901, "logps/chosen": -51.22389602661133, "logps/rejected": -19.773395538330078, "loss": 0.278, "rewards/accuracies": 1.0, "rewards/chosen": 2.810471773147583, "rewards/margins": 0.8000483512878418, "rewards/rejected": 2.010423421859741, "step": 7067 }, { "epoch": 1.56, "learning_rate": 1.193434195424205e-06, "logits/chosen": -1.6821383237838745, "logits/rejected": -1.627959132194519, "logps/chosen": -49.910945892333984, "logps/rejected": -36.39098358154297, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 4.032044887542725, "rewards/margins": 2.02408504486084, "rewards/rejected": 2.0079598426818848, "step": 7068 }, { "epoch": 1.56, "learning_rate": 1.1922723225332278e-06, "logits/chosen": -1.9150985479354858, "logits/rejected": -1.834102988243103, "logps/chosen": -46.24934768676758, "logps/rejected": -21.505313873291016, "loss": 0.0599, "rewards/accuracies": 1.0, "rewards/chosen": 2.510894536972046, "rewards/margins": 2.1604487895965576, "rewards/rejected": 0.3504457473754883, "step": 7069 }, { "epoch": 1.56, "learning_rate": 1.191110938926105e-06, "logits/chosen": -1.8481532335281372, "logits/rejected": -1.6536808013916016, "logps/chosen": -100.3515625, "logps/rejected": -20.058265686035156, "loss": 0.1129, "rewards/accuracies": 1.0, "rewards/chosen": 2.567675828933716, "rewards/margins": 1.7257471084594727, "rewards/rejected": 0.8419286608695984, "step": 7070 }, { "epoch": 1.57, "learning_rate": 1.1899500447520667e-06, "logits/chosen": -1.8644417524337769, "logits/rejected": -1.831248164176941, "logps/chosen": -43.13092803955078, "logps/rejected": -41.984092712402344, "loss": 0.4298, "rewards/accuracies": 1.0, "rewards/chosen": 3.542208194732666, "rewards/margins": 1.6356278657913208, "rewards/rejected": 1.9065803289413452, "step": 7071 }, { "epoch": 1.57, "learning_rate": 1.1887896401602922e-06, "logits/chosen": -2.175840377807617, "logits/rejected": -2.075906753540039, "logps/chosen": -169.45361328125, "logps/rejected": -35.060279846191406, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 9.950433731079102, "rewards/margins": 7.3411865234375, "rewards/rejected": 2.6092469692230225, "step": 7072 }, { "epoch": 1.57, "learning_rate": 1.1876297252998852e-06, "logits/chosen": -1.9602640867233276, "logits/rejected": -1.981284737586975, "logps/chosen": -68.11163330078125, "logps/rejected": -88.42115783691406, "loss": 0.3983, "rewards/accuracies": 0.0, "rewards/chosen": 7.128808498382568, "rewards/margins": -0.12439298629760742, "rewards/rejected": 7.253201484680176, "step": 7073 }, { "epoch": 1.57, "learning_rate": 1.1864703003198935e-06, "logits/chosen": -2.0561180114746094, "logits/rejected": -1.9874458312988281, "logps/chosen": -49.49537658691406, "logps/rejected": -28.585002899169922, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 3.1221084594726562, "rewards/margins": 2.949566125869751, "rewards/rejected": 0.17254237830638885, "step": 7074 }, { "epoch": 1.57, "learning_rate": 1.185311365369301e-06, "logits/chosen": -1.6671143770217896, "logits/rejected": -1.521998643875122, "logps/chosen": -74.82563781738281, "logps/rejected": -84.8652114868164, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": 6.490797519683838, "rewards/margins": 1.8104639053344727, "rewards/rejected": 4.680333614349365, "step": 7075 }, { "epoch": 1.57, "learning_rate": 1.1841529205970281e-06, "logits/chosen": -1.7600867748260498, "logits/rejected": -1.7600867748260498, "logps/chosen": -33.44123840332031, "logps/rejected": -33.44123840332031, "loss": 0.3502, "rewards/accuracies": 0.0, "rewards/chosen": 4.555578708648682, "rewards/margins": 0.0, "rewards/rejected": 4.555578708648682, "step": 7076 }, { "epoch": 1.57, "learning_rate": 1.182994966151932e-06, "logits/chosen": -1.7225825786590576, "logits/rejected": -1.7818392515182495, "logps/chosen": -28.855213165283203, "logps/rejected": -76.79949951171875, "loss": 1.145, "rewards/accuracies": 0.0, "rewards/chosen": 3.1061465740203857, "rewards/margins": -2.173626661300659, "rewards/rejected": 5.279773235321045, "step": 7077 }, { "epoch": 1.57, "learning_rate": 1.1818375021828066e-06, "logits/chosen": -2.236199140548706, "logits/rejected": -2.196279764175415, "logps/chosen": -123.01522064208984, "logps/rejected": -49.998374938964844, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 7.746784210205078, "rewards/margins": 5.4277801513671875, "rewards/rejected": 2.3190040588378906, "step": 7078 }, { "epoch": 1.57, "learning_rate": 1.1806805288383833e-06, "logits/chosen": -1.821558952331543, "logits/rejected": -1.6931639909744263, "logps/chosen": -39.87356185913086, "logps/rejected": -21.18312644958496, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 1.9672985076904297, "rewards/margins": 1.2138230800628662, "rewards/rejected": 0.7534753680229187, "step": 7079 }, { "epoch": 1.57, "learning_rate": 1.1795240462673324e-06, "logits/chosen": -1.6951816082000732, "logits/rejected": -1.6951816082000732, "logps/chosen": -28.554330825805664, "logps/rejected": -28.554330825805664, "loss": 0.5434, "rewards/accuracies": 0.0, "rewards/chosen": 4.873652935028076, "rewards/margins": 0.0, "rewards/rejected": 4.873652935028076, "step": 7080 }, { "epoch": 1.57, "learning_rate": 1.1783680546182557e-06, "logits/chosen": -2.1348912715911865, "logits/rejected": -1.795656442642212, "logps/chosen": -90.59909057617188, "logps/rejected": -66.25080871582031, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 7.903918743133545, "rewards/margins": 3.1599597930908203, "rewards/rejected": 4.743958950042725, "step": 7081 }, { "epoch": 1.57, "learning_rate": 1.1772125540396972e-06, "logits/chosen": -1.7132128477096558, "logits/rejected": -1.7132128477096558, "logps/chosen": -60.16973114013672, "logps/rejected": -60.16973114013672, "loss": 0.3796, "rewards/accuracies": 0.0, "rewards/chosen": 6.750522613525391, "rewards/margins": 0.0, "rewards/rejected": 6.750522613525391, "step": 7082 }, { "epoch": 1.57, "learning_rate": 1.176057544680136e-06, "logits/chosen": -1.828616738319397, "logits/rejected": -1.8511000871658325, "logps/chosen": -17.452228546142578, "logps/rejected": -41.22186279296875, "loss": 0.3705, "rewards/accuracies": 1.0, "rewards/chosen": 3.1553852558135986, "rewards/margins": 0.025294780731201172, "rewards/rejected": 3.1300904750823975, "step": 7083 }, { "epoch": 1.57, "learning_rate": 1.1749030266879874e-06, "logits/chosen": -1.6345230340957642, "logits/rejected": -1.6542818546295166, "logps/chosen": -42.100196838378906, "logps/rejected": -39.19081497192383, "loss": 0.6358, "rewards/accuracies": 0.0, "rewards/chosen": 2.508075714111328, "rewards/margins": -0.03135275840759277, "rewards/rejected": 2.539428472518921, "step": 7084 }, { "epoch": 1.57, "learning_rate": 1.1737490002116047e-06, "logits/chosen": -1.6490803956985474, "logits/rejected": -1.6490803956985474, "logps/chosen": -47.95674133300781, "logps/rejected": -47.95674133300781, "loss": 0.3471, "rewards/accuracies": 0.0, "rewards/chosen": 5.899209499359131, "rewards/margins": 0.0, "rewards/rejected": 5.899209499359131, "step": 7085 }, { "epoch": 1.57, "learning_rate": 1.1725954653992777e-06, "logits/chosen": -2.08510422706604, "logits/rejected": -2.0317816734313965, "logps/chosen": -110.87647247314453, "logps/rejected": -84.7598876953125, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 6.302056312561035, "rewards/margins": 1.7676324844360352, "rewards/rejected": 4.534423828125, "step": 7086 }, { "epoch": 1.57, "learning_rate": 1.171442422399232e-06, "logits/chosen": -1.858155369758606, "logits/rejected": -1.8312304019927979, "logps/chosen": -44.32728958129883, "logps/rejected": -61.05818557739258, "loss": 0.2992, "rewards/accuracies": 1.0, "rewards/chosen": 4.7329936027526855, "rewards/margins": 0.20722198486328125, "rewards/rejected": 4.525771617889404, "step": 7087 }, { "epoch": 1.57, "learning_rate": 1.1702898713596322e-06, "logits/chosen": -1.8019678592681885, "logits/rejected": -1.731526494026184, "logps/chosen": -72.9207763671875, "logps/rejected": -37.84946823120117, "loss": 0.0952, "rewards/accuracies": 1.0, "rewards/chosen": 4.061903476715088, "rewards/margins": 2.148047685623169, "rewards/rejected": 1.913855791091919, "step": 7088 }, { "epoch": 1.57, "learning_rate": 1.1691378124285785e-06, "logits/chosen": -2.23751163482666, "logits/rejected": -1.7630847692489624, "logps/chosen": -65.78053283691406, "logps/rejected": -77.80432891845703, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 8.205604553222656, "rewards/margins": 4.07007360458374, "rewards/rejected": 4.135530948638916, "step": 7089 }, { "epoch": 1.57, "learning_rate": 1.1679862457541052e-06, "logits/chosen": -1.8609510660171509, "logits/rejected": -1.7872107028961182, "logps/chosen": -79.12495422363281, "logps/rejected": -21.357379913330078, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 1.6778572797775269, "rewards/margins": 0.7576004862785339, "rewards/rejected": 0.9202567934989929, "step": 7090 }, { "epoch": 1.57, "learning_rate": 1.166835171484188e-06, "logits/chosen": -2.141078472137451, "logits/rejected": -2.1660711765289307, "logps/chosen": -111.05514526367188, "logps/rejected": -149.29940795898438, "loss": 0.1187, "rewards/accuracies": 1.0, "rewards/chosen": 11.6212158203125, "rewards/margins": 1.3320674896240234, "rewards/rejected": 10.289148330688477, "step": 7091 }, { "epoch": 1.57, "learning_rate": 1.1656845897667364e-06, "logits/chosen": -1.9121724367141724, "logits/rejected": -1.85834538936615, "logps/chosen": -53.526824951171875, "logps/rejected": -46.605712890625, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 6.057226657867432, "rewards/margins": 3.398089647293091, "rewards/rejected": 2.659137010574341, "step": 7092 }, { "epoch": 1.57, "learning_rate": 1.1645345007495973e-06, "logits/chosen": -1.974104881286621, "logits/rejected": -1.9681329727172852, "logps/chosen": -34.62629699707031, "logps/rejected": -68.24446105957031, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": 3.886212110519409, "rewards/margins": 0.8958556652069092, "rewards/rejected": 2.9903564453125, "step": 7093 }, { "epoch": 1.57, "learning_rate": 1.1633849045805545e-06, "logits/chosen": -2.0556466579437256, "logits/rejected": -2.0584583282470703, "logps/chosen": -42.79846954345703, "logps/rejected": -73.52928161621094, "loss": 0.6495, "rewards/accuracies": 0.0, "rewards/chosen": 4.9959588050842285, "rewards/margins": -0.7945547103881836, "rewards/rejected": 5.790513515472412, "step": 7094 }, { "epoch": 1.57, "learning_rate": 1.1622358014073286e-06, "logits/chosen": -1.9649721384048462, "logits/rejected": -1.9189404249191284, "logps/chosen": -78.10652160644531, "logps/rejected": -58.371116638183594, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 7.757228374481201, "rewards/margins": 4.523533821105957, "rewards/rejected": 3.233694553375244, "step": 7095 }, { "epoch": 1.57, "learning_rate": 1.1610871913775768e-06, "logits/chosen": -1.690875768661499, "logits/rejected": -1.6690996885299683, "logps/chosen": -50.45630645751953, "logps/rejected": -35.80805969238281, "loss": 0.3712, "rewards/accuracies": 0.0, "rewards/chosen": 4.181412696838379, "rewards/margins": -0.02466726303100586, "rewards/rejected": 4.206079959869385, "step": 7096 }, { "epoch": 1.57, "learning_rate": 1.1599390746388921e-06, "logits/chosen": -1.8056859970092773, "logits/rejected": -1.7897543907165527, "logps/chosen": -107.14766693115234, "logps/rejected": -49.54431915283203, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 8.849745750427246, "rewards/margins": 2.943824291229248, "rewards/rejected": 5.905921459197998, "step": 7097 }, { "epoch": 1.57, "learning_rate": 1.1587914513388067e-06, "logits/chosen": -2.203641891479492, "logits/rejected": -2.203641891479492, "logps/chosen": -38.253150939941406, "logps/rejected": -38.253150939941406, "loss": 0.6046, "rewards/accuracies": 0.0, "rewards/chosen": 4.492226600646973, "rewards/margins": 0.0, "rewards/rejected": 4.492226600646973, "step": 7098 }, { "epoch": 1.57, "learning_rate": 1.1576443216247828e-06, "logits/chosen": -1.947173833847046, "logits/rejected": -1.5405491590499878, "logps/chosen": -29.504608154296875, "logps/rejected": -46.91943359375, "loss": 0.4632, "rewards/accuracies": 1.0, "rewards/chosen": 5.070048809051514, "rewards/margins": 0.1741485595703125, "rewards/rejected": 4.895900249481201, "step": 7099 }, { "epoch": 1.57, "learning_rate": 1.15649768564423e-06, "logits/chosen": -1.8237195014953613, "logits/rejected": -1.8365504741668701, "logps/chosen": -97.41201782226562, "logps/rejected": -81.52543640136719, "loss": 0.5137, "rewards/accuracies": 0.0, "rewards/chosen": 8.7074556350708, "rewards/margins": -0.5400104522705078, "rewards/rejected": 9.247466087341309, "step": 7100 }, { "epoch": 1.57, "learning_rate": 1.1553515435444834e-06, "logits/chosen": -1.9980361461639404, "logits/rejected": -1.8905866146087646, "logps/chosen": -79.41304016113281, "logps/rejected": -60.051883697509766, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 6.9803361892700195, "rewards/margins": 4.8134765625, "rewards/rejected": 2.1668598651885986, "step": 7101 }, { "epoch": 1.57, "learning_rate": 1.1542058954728208e-06, "logits/chosen": -1.8321434259414673, "logits/rejected": -1.8321434259414673, "logps/chosen": -89.2863540649414, "logps/rejected": -89.2863540649414, "loss": 0.3525, "rewards/accuracies": 0.0, "rewards/chosen": 4.79565954208374, "rewards/margins": 0.0, "rewards/rejected": 4.79565954208374, "step": 7102 }, { "epoch": 1.57, "learning_rate": 1.1530607415764561e-06, "logits/chosen": -1.9260921478271484, "logits/rejected": -1.9257361888885498, "logps/chosen": -105.86309814453125, "logps/rejected": -100.81793212890625, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 8.837651252746582, "rewards/margins": 1.7843170166015625, "rewards/rejected": 7.0533342361450195, "step": 7103 }, { "epoch": 1.57, "learning_rate": 1.1519160820025382e-06, "logits/chosen": -2.062098741531372, "logits/rejected": -1.9207686185836792, "logps/chosen": -121.76488494873047, "logps/rejected": -48.743370056152344, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 9.134708404541016, "rewards/margins": 5.4514875411987305, "rewards/rejected": 3.683220624923706, "step": 7104 }, { "epoch": 1.57, "learning_rate": 1.1507719168981535e-06, "logits/chosen": -2.0200371742248535, "logits/rejected": -1.9652516841888428, "logps/chosen": -111.24039459228516, "logps/rejected": -116.98797607421875, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": 7.129435062408447, "rewards/margins": 3.5966591835021973, "rewards/rejected": 3.53277587890625, "step": 7105 }, { "epoch": 1.57, "learning_rate": 1.1496282464103237e-06, "logits/chosen": -1.9405944347381592, "logits/rejected": -1.9405944347381592, "logps/chosen": -32.28834915161133, "logps/rejected": -32.28834915161133, "loss": 0.3874, "rewards/accuracies": 0.0, "rewards/chosen": 3.4687085151672363, "rewards/margins": 0.0, "rewards/rejected": 3.4687085151672363, "step": 7106 }, { "epoch": 1.57, "learning_rate": 1.1484850706860096e-06, "logits/chosen": -1.8611390590667725, "logits/rejected": -1.8611390590667725, "logps/chosen": -20.863054275512695, "logps/rejected": -20.863054275512695, "loss": 0.3618, "rewards/accuracies": 0.0, "rewards/chosen": 2.144085168838501, "rewards/margins": 0.0, "rewards/rejected": 2.144085168838501, "step": 7107 }, { "epoch": 1.57, "learning_rate": 1.1473423898721025e-06, "logits/chosen": -1.7113467454910278, "logits/rejected": -1.6474305391311646, "logps/chosen": -42.927894592285156, "logps/rejected": -76.37742614746094, "loss": 0.2713, "rewards/accuracies": 1.0, "rewards/chosen": 4.647219181060791, "rewards/margins": 1.7422935962677002, "rewards/rejected": 2.904925584793091, "step": 7108 }, { "epoch": 1.57, "learning_rate": 1.1462002041154396e-06, "logits/chosen": -1.9894566535949707, "logits/rejected": -1.9801459312438965, "logps/chosen": -102.42758178710938, "logps/rejected": -126.06317138671875, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 9.837786674499512, "rewards/margins": 3.0009074211120605, "rewards/rejected": 6.836879253387451, "step": 7109 }, { "epoch": 1.57, "learning_rate": 1.145058513562785e-06, "logits/chosen": -1.8291634321212769, "logits/rejected": -1.7454553842544556, "logps/chosen": -50.53557586669922, "logps/rejected": -45.570926666259766, "loss": 0.4966, "rewards/accuracies": 0.0, "rewards/chosen": 2.7949492931365967, "rewards/margins": -0.5250606536865234, "rewards/rejected": 3.32000994682312, "step": 7110 }, { "epoch": 1.57, "learning_rate": 1.1439173183608444e-06, "logits/chosen": -1.8918988704681396, "logits/rejected": -1.9371577501296997, "logps/chosen": -89.893310546875, "logps/rejected": -112.813232421875, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 9.680809020996094, "rewards/margins": 1.4920930862426758, "rewards/rejected": 8.188715934753418, "step": 7111 }, { "epoch": 1.57, "learning_rate": 1.1427766186562582e-06, "logits/chosen": -1.9775569438934326, "logits/rejected": -1.960414171218872, "logps/chosen": -82.40303039550781, "logps/rejected": -58.85771942138672, "loss": 0.1397, "rewards/accuracies": 1.0, "rewards/chosen": 5.700250148773193, "rewards/margins": 1.228560447692871, "rewards/rejected": 4.471689701080322, "step": 7112 }, { "epoch": 1.57, "learning_rate": 1.1416364145956039e-06, "logits/chosen": -2.1075782775878906, "logits/rejected": -2.095625400543213, "logps/chosen": -59.724761962890625, "logps/rejected": -84.97880554199219, "loss": 0.4059, "rewards/accuracies": 0.0, "rewards/chosen": 3.882786512374878, "rewards/margins": -0.15772414207458496, "rewards/rejected": 4.040510654449463, "step": 7113 }, { "epoch": 1.57, "learning_rate": 1.1404967063253958e-06, "logits/chosen": -1.8035204410552979, "logits/rejected": -1.7756214141845703, "logps/chosen": -40.56205749511719, "logps/rejected": -59.59449005126953, "loss": 0.3986, "rewards/accuracies": 1.0, "rewards/chosen": 3.6483216285705566, "rewards/margins": 0.7593071460723877, "rewards/rejected": 2.889014482498169, "step": 7114 }, { "epoch": 1.57, "learning_rate": 1.1393574939920822e-06, "logits/chosen": -2.1099023818969727, "logits/rejected": -2.042056083679199, "logps/chosen": -65.57271575927734, "logps/rejected": -126.59354400634766, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 10.731623649597168, "rewards/margins": 4.187248706817627, "rewards/rejected": 6.544374942779541, "step": 7115 }, { "epoch": 1.58, "learning_rate": 1.1382187777420517e-06, "logits/chosen": -1.9902448654174805, "logits/rejected": -1.9541735649108887, "logps/chosen": -106.07450866699219, "logps/rejected": -80.47481536865234, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 6.242076396942139, "rewards/margins": 2.6801393032073975, "rewards/rejected": 3.561937093734741, "step": 7116 }, { "epoch": 1.58, "learning_rate": 1.137080557721622e-06, "logits/chosen": -1.8608367443084717, "logits/rejected": -1.7926839590072632, "logps/chosen": -113.70986938476562, "logps/rejected": -60.175537109375, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 6.644140720367432, "rewards/margins": 4.150940895080566, "rewards/rejected": 2.4932000637054443, "step": 7117 }, { "epoch": 1.58, "learning_rate": 1.1359428340770567e-06, "logits/chosen": -1.8743668794631958, "logits/rejected": -1.8743668794631958, "logps/chosen": -3.7196249961853027, "logps/rejected": -3.7196249961853027, "loss": 1.99, "rewards/accuracies": 0.0, "rewards/chosen": 1.2866111993789673, "rewards/margins": 0.0, "rewards/rejected": 1.2866111993789673, "step": 7118 }, { "epoch": 1.58, "learning_rate": 1.134805606954546e-06, "logits/chosen": -1.8954949378967285, "logits/rejected": -1.8052929639816284, "logps/chosen": -84.03059387207031, "logps/rejected": -63.39707946777344, "loss": 0.2399, "rewards/accuracies": 1.0, "rewards/chosen": 4.659739971160889, "rewards/margins": 0.5029067993164062, "rewards/rejected": 4.156833171844482, "step": 7119 }, { "epoch": 1.58, "learning_rate": 1.1336688765002263e-06, "logits/chosen": -2.0937023162841797, "logits/rejected": -1.8879318237304688, "logps/chosen": -122.58384704589844, "logps/rejected": -80.94405364990234, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 5.429985046386719, "rewards/margins": 7.025465965270996, "rewards/rejected": -1.5954811573028564, "step": 7120 }, { "epoch": 1.58, "learning_rate": 1.1325326428601596e-06, "logits/chosen": -2.224794626235962, "logits/rejected": -2.0731053352355957, "logps/chosen": -110.10599517822266, "logps/rejected": -19.644453048706055, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 8.373983383178711, "rewards/margins": 6.8355512619018555, "rewards/rejected": 1.5384321212768555, "step": 7121 }, { "epoch": 1.58, "learning_rate": 1.131396906180351e-06, "logits/chosen": -1.7412223815917969, "logits/rejected": -1.7235909700393677, "logps/chosen": -55.22105407714844, "logps/rejected": -72.6016616821289, "loss": 1.1223, "rewards/accuracies": 0.0, "rewards/chosen": 3.7175049781799316, "rewards/margins": -0.036623239517211914, "rewards/rejected": 3.7541282176971436, "step": 7122 }, { "epoch": 1.58, "learning_rate": 1.1302616666067406e-06, "logits/chosen": -2.012885808944702, "logits/rejected": -1.9346174001693726, "logps/chosen": -63.85232925415039, "logps/rejected": -43.82228088378906, "loss": 0.1292, "rewards/accuracies": 1.0, "rewards/chosen": 5.7079291343688965, "rewards/margins": 2.1793570518493652, "rewards/rejected": 3.5285720825195312, "step": 7123 }, { "epoch": 1.58, "learning_rate": 1.1291269242852032e-06, "logits/chosen": -1.5612629652023315, "logits/rejected": -1.5718200206756592, "logps/chosen": -34.55893325805664, "logps/rejected": -44.619659423828125, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": 2.1667168140411377, "rewards/margins": 0.4966259002685547, "rewards/rejected": 1.670090913772583, "step": 7124 }, { "epoch": 1.58, "learning_rate": 1.127992679361552e-06, "logits/chosen": -1.8653978109359741, "logits/rejected": -1.8650075197219849, "logps/chosen": -65.50892639160156, "logps/rejected": -70.52660369873047, "loss": 0.3415, "rewards/accuracies": 1.0, "rewards/chosen": 5.838830471038818, "rewards/margins": 0.03569221496582031, "rewards/rejected": 5.803138256072998, "step": 7125 }, { "epoch": 1.58, "learning_rate": 1.126858931981531e-06, "logits/chosen": -2.1412482261657715, "logits/rejected": -2.0702171325683594, "logps/chosen": -34.340641021728516, "logps/rejected": -18.301666259765625, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 2.5792999267578125, "rewards/margins": 2.21461820602417, "rewards/rejected": 0.3646816313266754, "step": 7126 }, { "epoch": 1.58, "learning_rate": 1.1257256822908286e-06, "logits/chosen": -2.024162530899048, "logits/rejected": -2.026693820953369, "logps/chosen": -57.274471282958984, "logps/rejected": -89.79763793945312, "loss": 0.1019, "rewards/accuracies": 1.0, "rewards/chosen": 8.386395454406738, "rewards/margins": 1.9236512184143066, "rewards/rejected": 6.462744235992432, "step": 7127 }, { "epoch": 1.58, "learning_rate": 1.1245929304350607e-06, "logits/chosen": -1.7899806499481201, "logits/rejected": -1.775010347366333, "logps/chosen": -62.8914680480957, "logps/rejected": -37.63685607910156, "loss": 0.9079, "rewards/accuracies": 1.0, "rewards/chosen": 3.14764666557312, "rewards/margins": 0.08702945709228516, "rewards/rejected": 3.060617208480835, "step": 7128 }, { "epoch": 1.58, "learning_rate": 1.1234606765597871e-06, "logits/chosen": -2.101783514022827, "logits/rejected": -2.108964443206787, "logps/chosen": -64.384033203125, "logps/rejected": -119.2470703125, "loss": 0.4953, "rewards/accuracies": 0.0, "rewards/chosen": 4.334147930145264, "rewards/margins": -0.5196118354797363, "rewards/rejected": 4.853759765625, "step": 7129 }, { "epoch": 1.58, "learning_rate": 1.1223289208104965e-06, "logits/chosen": -2.095085620880127, "logits/rejected": -2.089423179626465, "logps/chosen": -34.28936767578125, "logps/rejected": -28.19886589050293, "loss": 0.1973, "rewards/accuracies": 1.0, "rewards/chosen": 4.1787238121032715, "rewards/margins": 1.2368545532226562, "rewards/rejected": 2.9418692588806152, "step": 7130 }, { "epoch": 1.58, "learning_rate": 1.1211976633326176e-06, "logits/chosen": -2.0537378787994385, "logits/rejected": -1.93473219871521, "logps/chosen": -81.70355224609375, "logps/rejected": -32.6817626953125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 8.189247131347656, "rewards/margins": 4.89669132232666, "rewards/rejected": 3.292556047439575, "step": 7131 }, { "epoch": 1.58, "learning_rate": 1.1200669042715163e-06, "logits/chosen": -1.941269040107727, "logits/rejected": -1.7173317670822144, "logps/chosen": -84.36637115478516, "logps/rejected": -72.44922637939453, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": 5.214409828186035, "rewards/margins": 3.773280620574951, "rewards/rejected": 1.4411293268203735, "step": 7132 }, { "epoch": 1.58, "learning_rate": 1.1189366437724875e-06, "logits/chosen": -1.9227453470230103, "logits/rejected": -1.9657928943634033, "logps/chosen": -40.57099151611328, "logps/rejected": -79.99042510986328, "loss": 0.7437, "rewards/accuracies": 0.0, "rewards/chosen": 5.288865089416504, "rewards/margins": -1.086015224456787, "rewards/rejected": 6.374880313873291, "step": 7133 }, { "epoch": 1.58, "learning_rate": 1.1178068819807735e-06, "logits/chosen": -1.9628716707229614, "logits/rejected": -1.9628716707229614, "logps/chosen": -53.20148468017578, "logps/rejected": -53.20148468017578, "loss": 0.3939, "rewards/accuracies": 0.0, "rewards/chosen": 5.229488372802734, "rewards/margins": 0.0, "rewards/rejected": 5.229488372802734, "step": 7134 }, { "epoch": 1.58, "learning_rate": 1.1166776190415396e-06, "logits/chosen": -2.1519973278045654, "logits/rejected": -2.116384983062744, "logps/chosen": -44.992679595947266, "logps/rejected": -39.52401351928711, "loss": 0.4264, "rewards/accuracies": 0.0, "rewards/chosen": 3.4718143939971924, "rewards/margins": -0.2802748680114746, "rewards/rejected": 3.752089262008667, "step": 7135 }, { "epoch": 1.58, "learning_rate": 1.1155488550998989e-06, "logits/chosen": -2.195389747619629, "logits/rejected": -2.179529905319214, "logps/chosen": -73.73348999023438, "logps/rejected": -62.236263275146484, "loss": 0.2042, "rewards/accuracies": 1.0, "rewards/chosen": 6.121350288391113, "rewards/margins": 0.7250819206237793, "rewards/rejected": 5.396268367767334, "step": 7136 }, { "epoch": 1.58, "learning_rate": 1.1144205903008898e-06, "logits/chosen": -2.1695539951324463, "logits/rejected": -2.1821372509002686, "logps/chosen": -81.25309753417969, "logps/rejected": -133.65737915039062, "loss": 0.1449, "rewards/accuracies": 1.0, "rewards/chosen": 7.309321880340576, "rewards/margins": 1.1797747611999512, "rewards/rejected": 6.129547119140625, "step": 7137 }, { "epoch": 1.58, "learning_rate": 1.1132928247894976e-06, "logits/chosen": -2.0873336791992188, "logits/rejected": -2.0545051097869873, "logps/chosen": -44.96644973754883, "logps/rejected": -58.59396743774414, "loss": 0.2934, "rewards/accuracies": 1.0, "rewards/chosen": 3.6828808784484863, "rewards/margins": 2.239956855773926, "rewards/rejected": 1.44292414188385, "step": 7138 }, { "epoch": 1.58, "learning_rate": 1.1121655587106318e-06, "logits/chosen": -1.7521709203720093, "logits/rejected": -1.661920428276062, "logps/chosen": -31.913429260253906, "logps/rejected": -10.045434951782227, "loss": 0.4854, "rewards/accuracies": 1.0, "rewards/chosen": 2.126455783843994, "rewards/margins": 1.1393117904663086, "rewards/rejected": 0.9871439337730408, "step": 7139 }, { "epoch": 1.58, "learning_rate": 1.111038792209146e-06, "logits/chosen": -2.2598791122436523, "logits/rejected": -2.2027475833892822, "logps/chosen": -58.77618408203125, "logps/rejected": -29.91836166381836, "loss": 0.1433, "rewards/accuracies": 1.0, "rewards/chosen": 4.8502349853515625, "rewards/margins": 1.7576944828033447, "rewards/rejected": 3.0925405025482178, "step": 7140 }, { "epoch": 1.58, "learning_rate": 1.1099125254298277e-06, "logits/chosen": -1.9900699853897095, "logits/rejected": -1.9424986839294434, "logps/chosen": -87.5232925415039, "logps/rejected": -15.02903938293457, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": 3.3874688148498535, "rewards/margins": 2.2194244861602783, "rewards/rejected": 1.1680443286895752, "step": 7141 }, { "epoch": 1.58, "learning_rate": 1.1087867585173956e-06, "logits/chosen": -1.8464083671569824, "logits/rejected": -1.8036057949066162, "logps/chosen": -67.22659301757812, "logps/rejected": -71.12640380859375, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 7.6489458084106445, "rewards/margins": 1.735936164855957, "rewards/rejected": 5.9130096435546875, "step": 7142 }, { "epoch": 1.58, "learning_rate": 1.107661491616514e-06, "logits/chosen": -1.7156718969345093, "logits/rejected": -1.7045668363571167, "logps/chosen": -30.035633087158203, "logps/rejected": -58.641273498535156, "loss": 1.5967, "rewards/accuracies": 1.0, "rewards/chosen": 3.55344820022583, "rewards/margins": 1.6200512647628784, "rewards/rejected": 1.9333969354629517, "step": 7143 }, { "epoch": 1.58, "learning_rate": 1.1065367248717707e-06, "logits/chosen": -1.701202630996704, "logits/rejected": -1.7143654823303223, "logps/chosen": -50.01420593261719, "logps/rejected": -158.05545043945312, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 3.7818291187286377, "rewards/margins": 2.368450164794922, "rewards/rejected": 1.4133789539337158, "step": 7144 }, { "epoch": 1.58, "learning_rate": 1.1054124584277016e-06, "logits/chosen": -1.860281229019165, "logits/rejected": -1.8743822574615479, "logps/chosen": -22.004322052001953, "logps/rejected": -21.87690544128418, "loss": 0.2081, "rewards/accuracies": 1.0, "rewards/chosen": 3.0303685665130615, "rewards/margins": 0.6620445251464844, "rewards/rejected": 2.368324041366577, "step": 7145 }, { "epoch": 1.58, "learning_rate": 1.104288692428766e-06, "logits/chosen": -1.8680421113967896, "logits/rejected": -1.8976373672485352, "logps/chosen": -40.45357894897461, "logps/rejected": -105.36640930175781, "loss": 1.1326, "rewards/accuracies": 0.0, "rewards/chosen": 3.54778790473938, "rewards/margins": -1.3393504619598389, "rewards/rejected": 4.887138366699219, "step": 7146 }, { "epoch": 1.58, "learning_rate": 1.1031654270193714e-06, "logits/chosen": -1.831447958946228, "logits/rejected": -1.7636457681655884, "logps/chosen": -149.2373504638672, "logps/rejected": -107.664306640625, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": 7.816459655761719, "rewards/margins": 3.743849277496338, "rewards/rejected": 4.072610378265381, "step": 7147 }, { "epoch": 1.58, "learning_rate": 1.1020426623438507e-06, "logits/chosen": -2.2216877937316895, "logits/rejected": -2.1826987266540527, "logps/chosen": -56.42371368408203, "logps/rejected": -18.793357849121094, "loss": 0.3767, "rewards/accuracies": 1.0, "rewards/chosen": 2.8233025074005127, "rewards/margins": 2.4285261631011963, "rewards/rejected": 0.3947763442993164, "step": 7148 }, { "epoch": 1.58, "learning_rate": 1.1009203985464773e-06, "logits/chosen": -2.0698511600494385, "logits/rejected": -1.8676124811172485, "logps/chosen": -102.7828140258789, "logps/rejected": -24.832019805908203, "loss": 0.0901, "rewards/accuracies": 1.0, "rewards/chosen": 9.3804349899292, "rewards/margins": 9.178544044494629, "rewards/rejected": 0.20189133286476135, "step": 7149 }, { "epoch": 1.58, "learning_rate": 1.099798635771459e-06, "logits/chosen": -2.489405632019043, "logits/rejected": -2.5356061458587646, "logps/chosen": -110.30651092529297, "logps/rejected": -94.61784362792969, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 10.525115013122559, "rewards/margins": 3.4216341972351074, "rewards/rejected": 7.103480815887451, "step": 7150 }, { "epoch": 1.58, "learning_rate": 1.0986773741629408e-06, "logits/chosen": -2.1258599758148193, "logits/rejected": -2.0963666439056396, "logps/chosen": -138.70651245117188, "logps/rejected": -59.637657165527344, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 5.891971111297607, "rewards/margins": 3.1910951137542725, "rewards/rejected": 2.700875997543335, "step": 7151 }, { "epoch": 1.58, "learning_rate": 1.097556613865003e-06, "logits/chosen": -1.900080680847168, "logits/rejected": -1.9091267585754395, "logps/chosen": -34.268096923828125, "logps/rejected": -26.591157913208008, "loss": 0.2731, "rewards/accuracies": 1.0, "rewards/chosen": 2.412012815475464, "rewards/margins": 0.6707547903060913, "rewards/rejected": 1.7412580251693726, "step": 7152 }, { "epoch": 1.58, "learning_rate": 1.0964363550216555e-06, "logits/chosen": -1.9904718399047852, "logits/rejected": -1.9412238597869873, "logps/chosen": -44.729774475097656, "logps/rejected": -23.218639373779297, "loss": 0.1072, "rewards/accuracies": 1.0, "rewards/chosen": 2.8920631408691406, "rewards/margins": 2.4649300575256348, "rewards/rejected": 0.4271329939365387, "step": 7153 }, { "epoch": 1.58, "learning_rate": 1.095316597776856e-06, "logits/chosen": -2.2250936031341553, "logits/rejected": -2.2508950233459473, "logps/chosen": -64.84195709228516, "logps/rejected": -76.39100646972656, "loss": 2.6493, "rewards/accuracies": 0.0, "rewards/chosen": 2.8850998878479004, "rewards/margins": -2.593801975250244, "rewards/rejected": 5.4789018630981445, "step": 7154 }, { "epoch": 1.58, "learning_rate": 1.0941973422744856e-06, "logits/chosen": -1.9860966205596924, "logits/rejected": -1.9752131700515747, "logps/chosen": -48.52477264404297, "logps/rejected": -45.217041015625, "loss": 0.2034, "rewards/accuracies": 1.0, "rewards/chosen": 3.8078911304473877, "rewards/margins": 0.9700157642364502, "rewards/rejected": 2.8378753662109375, "step": 7155 }, { "epoch": 1.58, "learning_rate": 1.0930785886583672e-06, "logits/chosen": -2.1296961307525635, "logits/rejected": -2.1422970294952393, "logps/chosen": -8.976301193237305, "logps/rejected": -17.924224853515625, "loss": 0.5723, "rewards/accuracies": 0.0, "rewards/chosen": 1.042885422706604, "rewards/margins": -0.6484895944595337, "rewards/rejected": 1.6913750171661377, "step": 7156 }, { "epoch": 1.58, "learning_rate": 1.0919603370722582e-06, "logits/chosen": -1.666197419166565, "logits/rejected": -1.6331819295883179, "logps/chosen": -47.25547790527344, "logps/rejected": -72.24295806884766, "loss": 0.5804, "rewards/accuracies": 0.0, "rewards/chosen": 3.063053846359253, "rewards/margins": -0.7552042007446289, "rewards/rejected": 3.818258047103882, "step": 7157 }, { "epoch": 1.58, "learning_rate": 1.0908425876598512e-06, "logits/chosen": -1.858530879020691, "logits/rejected": -1.8700549602508545, "logps/chosen": -48.714515686035156, "logps/rejected": -71.00518798828125, "loss": 1.2371, "rewards/accuracies": 0.0, "rewards/chosen": 3.36750864982605, "rewards/margins": -0.8445937633514404, "rewards/rejected": 4.21210241317749, "step": 7158 }, { "epoch": 1.58, "learning_rate": 1.0897253405647746e-06, "logits/chosen": -1.9262986183166504, "logits/rejected": -1.778931975364685, "logps/chosen": -210.51092529296875, "logps/rejected": -40.319679260253906, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 7.206045627593994, "rewards/margins": 3.7425782680511475, "rewards/rejected": 3.4634673595428467, "step": 7159 }, { "epoch": 1.58, "learning_rate": 1.0886085959305915e-06, "logits/chosen": -2.074037790298462, "logits/rejected": -2.022865056991577, "logps/chosen": -103.35874938964844, "logps/rejected": -68.40127563476562, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": 8.957969665527344, "rewards/margins": 2.141439914703369, "rewards/rejected": 6.816529750823975, "step": 7160 }, { "epoch": 1.58, "learning_rate": 1.0874923539008025e-06, "logits/chosen": -1.8460687398910522, "logits/rejected": -1.8360365629196167, "logps/chosen": -94.86639404296875, "logps/rejected": -53.112892150878906, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 10.592883110046387, "rewards/margins": 5.87388801574707, "rewards/rejected": 4.718995094299316, "step": 7161 }, { "epoch": 1.59, "learning_rate": 1.0863766146188382e-06, "logits/chosen": -1.9579637050628662, "logits/rejected": -1.829366683959961, "logps/chosen": -67.48472595214844, "logps/rejected": -15.771526336669922, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": 6.2919769287109375, "rewards/margins": 4.673203945159912, "rewards/rejected": 1.6187728643417358, "step": 7162 }, { "epoch": 1.59, "learning_rate": 1.085261378228073e-06, "logits/chosen": -1.9466335773468018, "logits/rejected": -2.0183310508728027, "logps/chosen": -32.34196090698242, "logps/rejected": -70.72471618652344, "loss": 1.7053, "rewards/accuracies": 0.0, "rewards/chosen": 4.899040699005127, "rewards/margins": -3.345092296600342, "rewards/rejected": 8.244132995605469, "step": 7163 }, { "epoch": 1.59, "learning_rate": 1.0841466448718086e-06, "logits/chosen": -1.9453614950180054, "logits/rejected": -1.9006695747375488, "logps/chosen": -156.21299743652344, "logps/rejected": -122.66864013671875, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 6.90859842300415, "rewards/margins": 3.6565628051757812, "rewards/rejected": 3.252035617828369, "step": 7164 }, { "epoch": 1.59, "learning_rate": 1.0830324146932875e-06, "logits/chosen": -2.1362977027893066, "logits/rejected": -2.119321346282959, "logps/chosen": -22.371976852416992, "logps/rejected": -36.56957244873047, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": 3.1357362270355225, "rewards/margins": 1.5168267488479614, "rewards/rejected": 1.618909478187561, "step": 7165 }, { "epoch": 1.59, "learning_rate": 1.081918687835684e-06, "logits/chosen": -1.8234882354736328, "logits/rejected": -1.830717921257019, "logps/chosen": -31.440624237060547, "logps/rejected": -75.7474594116211, "loss": 0.5421, "rewards/accuracies": 0.0, "rewards/chosen": 3.300842761993408, "rewards/margins": -0.5876133441925049, "rewards/rejected": 3.888456106185913, "step": 7166 }, { "epoch": 1.59, "learning_rate": 1.0808054644421112e-06, "logits/chosen": -2.086703300476074, "logits/rejected": -2.084613561630249, "logps/chosen": -64.67642211914062, "logps/rejected": -63.44905090332031, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": 4.811298370361328, "rewards/margins": 0.6077637672424316, "rewards/rejected": 4.2035346031188965, "step": 7167 }, { "epoch": 1.59, "learning_rate": 1.0796927446556143e-06, "logits/chosen": -1.884902000427246, "logits/rejected": -1.863121509552002, "logps/chosen": -54.251731872558594, "logps/rejected": -54.60157012939453, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 3.5977699756622314, "rewards/margins": 1.090611219406128, "rewards/rejected": 2.5071587562561035, "step": 7168 }, { "epoch": 1.59, "learning_rate": 1.0785805286191758e-06, "logits/chosen": -1.831818699836731, "logits/rejected": -1.6281208992004395, "logps/chosen": -86.02166748046875, "logps/rejected": -10.701685905456543, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 5.488111972808838, "rewards/margins": 1.6639256477355957, "rewards/rejected": 3.824186325073242, "step": 7169 }, { "epoch": 1.59, "learning_rate": 1.0774688164757124e-06, "logits/chosen": -2.003424882888794, "logits/rejected": -1.9983714818954468, "logps/chosen": -27.554359436035156, "logps/rejected": -29.48525047302246, "loss": 0.3022, "rewards/accuracies": 1.0, "rewards/chosen": 1.9911037683486938, "rewards/margins": 0.3784482479095459, "rewards/rejected": 1.612655520439148, "step": 7170 }, { "epoch": 1.59, "learning_rate": 1.0763576083680766e-06, "logits/chosen": -1.68929123878479, "logits/rejected": -1.672301173210144, "logps/chosen": -58.2933235168457, "logps/rejected": -41.753883361816406, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": 7.450892925262451, "rewards/margins": 4.188241958618164, "rewards/rejected": 3.262651205062866, "step": 7171 }, { "epoch": 1.59, "learning_rate": 1.0752469044390574e-06, "logits/chosen": -1.793053150177002, "logits/rejected": -1.7766066789627075, "logps/chosen": -41.81504440307617, "logps/rejected": -77.39910888671875, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 4.423416614532471, "rewards/margins": 1.4169254302978516, "rewards/rejected": 3.006491184234619, "step": 7172 }, { "epoch": 1.59, "learning_rate": 1.0741367048313744e-06, "logits/chosen": -2.044823408126831, "logits/rejected": -2.0038046836853027, "logps/chosen": -162.905517578125, "logps/rejected": -108.94949340820312, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": 4.872251987457275, "rewards/margins": 2.429577589035034, "rewards/rejected": 2.442674398422241, "step": 7173 }, { "epoch": 1.59, "learning_rate": 1.0730270096876876e-06, "logits/chosen": -1.8601678609848022, "logits/rejected": -1.8144629001617432, "logps/chosen": -49.59899139404297, "logps/rejected": -49.58894729614258, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 4.035293102264404, "rewards/margins": 2.344167947769165, "rewards/rejected": 1.6911251544952393, "step": 7174 }, { "epoch": 1.59, "learning_rate": 1.0719178191505898e-06, "logits/chosen": -2.115712881088257, "logits/rejected": -1.997167706489563, "logps/chosen": -130.04376220703125, "logps/rejected": -20.310543060302734, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 8.670068740844727, "rewards/margins": 4.602464199066162, "rewards/rejected": 4.0676045417785645, "step": 7175 }, { "epoch": 1.59, "learning_rate": 1.0708091333626097e-06, "logits/chosen": -2.1036243438720703, "logits/rejected": -1.9351483583450317, "logps/chosen": -76.85197448730469, "logps/rejected": -151.56988525390625, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": 5.77516508102417, "rewards/margins": 2.260425090789795, "rewards/rejected": 3.514739990234375, "step": 7176 }, { "epoch": 1.59, "learning_rate": 1.06970095246621e-06, "logits/chosen": -1.8747056722640991, "logits/rejected": -1.8433847427368164, "logps/chosen": -66.80119323730469, "logps/rejected": -41.4133186340332, "loss": 0.2995, "rewards/accuracies": 1.0, "rewards/chosen": 3.9957504272460938, "rewards/margins": 0.4156513214111328, "rewards/rejected": 3.580099105834961, "step": 7177 }, { "epoch": 1.59, "learning_rate": 1.0685932766037898e-06, "logits/chosen": -1.658678650856018, "logits/rejected": -1.539626121520996, "logps/chosen": -51.078765869140625, "logps/rejected": -54.196067810058594, "loss": 0.2841, "rewards/accuracies": 1.0, "rewards/chosen": 3.678330183029175, "rewards/margins": 0.5033750534057617, "rewards/rejected": 3.174955129623413, "step": 7178 }, { "epoch": 1.59, "learning_rate": 1.0674861059176827e-06, "logits/chosen": -2.1424341201782227, "logits/rejected": -2.107961893081665, "logps/chosen": -100.4345703125, "logps/rejected": -97.09169006347656, "loss": 0.1759, "rewards/accuracies": 1.0, "rewards/chosen": 6.3396759033203125, "rewards/margins": 0.9905533790588379, "rewards/rejected": 5.349122524261475, "step": 7179 }, { "epoch": 1.59, "learning_rate": 1.066379440550158e-06, "logits/chosen": -2.0327789783477783, "logits/rejected": -1.910868763923645, "logps/chosen": -91.99981689453125, "logps/rejected": -212.05636596679688, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 12.17377758026123, "rewards/margins": 2.1422958374023438, "rewards/rejected": 10.031481742858887, "step": 7180 }, { "epoch": 1.59, "learning_rate": 1.0652732806434186e-06, "logits/chosen": -1.9542977809906006, "logits/rejected": -1.9058914184570312, "logps/chosen": -49.224769592285156, "logps/rejected": -43.18938446044922, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 5.78020715713501, "rewards/margins": 3.843742847442627, "rewards/rejected": 1.9364643096923828, "step": 7181 }, { "epoch": 1.59, "learning_rate": 1.064167626339605e-06, "logits/chosen": -1.9146267175674438, "logits/rejected": -1.928431510925293, "logps/chosen": -53.60137939453125, "logps/rejected": -57.865779876708984, "loss": 0.4023, "rewards/accuracies": 0.0, "rewards/chosen": 2.397343397140503, "rewards/margins": -0.21140265464782715, "rewards/rejected": 2.60874605178833, "step": 7182 }, { "epoch": 1.59, "learning_rate": 1.063062477780789e-06, "logits/chosen": -1.7338327169418335, "logits/rejected": -1.6783838272094727, "logps/chosen": -64.49726104736328, "logps/rejected": -26.215002059936523, "loss": 1.7315, "rewards/accuracies": 1.0, "rewards/chosen": 3.7716362476348877, "rewards/margins": 1.3871006965637207, "rewards/rejected": 2.384535551071167, "step": 7183 }, { "epoch": 1.59, "learning_rate": 1.0619578351089805e-06, "logits/chosen": -1.9235047101974487, "logits/rejected": -1.928899884223938, "logps/chosen": -78.04867553710938, "logps/rejected": -52.67113494873047, "loss": 0.5126, "rewards/accuracies": 1.0, "rewards/chosen": 4.686871528625488, "rewards/margins": 1.2695016860961914, "rewards/rejected": 3.417369842529297, "step": 7184 }, { "epoch": 1.59, "learning_rate": 1.0608536984661227e-06, "logits/chosen": -1.8640695810317993, "logits/rejected": -1.8187419176101685, "logps/chosen": -45.97069549560547, "logps/rejected": -53.04277038574219, "loss": 0.9916, "rewards/accuracies": 0.0, "rewards/chosen": 3.3429718017578125, "rewards/margins": -1.3898615837097168, "rewards/rejected": 4.732833385467529, "step": 7185 }, { "epoch": 1.59, "learning_rate": 1.059750067994096e-06, "logits/chosen": -1.7752351760864258, "logits/rejected": -1.7638047933578491, "logps/chosen": -48.49857711791992, "logps/rejected": -54.37470245361328, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": 4.5410237312316895, "rewards/margins": 2.309760808944702, "rewards/rejected": 2.2312629222869873, "step": 7186 }, { "epoch": 1.59, "learning_rate": 1.058646943834713e-06, "logits/chosen": -1.8618422746658325, "logits/rejected": -1.8644025325775146, "logps/chosen": -81.30909729003906, "logps/rejected": -93.39295196533203, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": 5.769026279449463, "rewards/margins": 2.570234775543213, "rewards/rejected": 3.19879150390625, "step": 7187 }, { "epoch": 1.59, "learning_rate": 1.057544326129723e-06, "logits/chosen": -2.2150940895080566, "logits/rejected": -2.1534955501556396, "logps/chosen": -50.12825012207031, "logps/rejected": -23.228580474853516, "loss": 0.2013, "rewards/accuracies": 1.0, "rewards/chosen": 3.3020546436309814, "rewards/margins": 1.8511196374893188, "rewards/rejected": 1.4509350061416626, "step": 7188 }, { "epoch": 1.59, "learning_rate": 1.0564422150208098e-06, "logits/chosen": -1.7935422658920288, "logits/rejected": -1.6285139322280884, "logps/chosen": -24.851545333862305, "logps/rejected": -34.27594757080078, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 2.4666574001312256, "rewards/margins": 0.05079960823059082, "rewards/rejected": 2.4158577919006348, "step": 7189 }, { "epoch": 1.59, "learning_rate": 1.0553406106495923e-06, "logits/chosen": -2.0518760681152344, "logits/rejected": -2.0986273288726807, "logps/chosen": -24.71002960205078, "logps/rejected": -82.39376831054688, "loss": 1.3666, "rewards/accuracies": 0.0, "rewards/chosen": 4.40908670425415, "rewards/margins": -2.580461025238037, "rewards/rejected": 6.9895477294921875, "step": 7190 }, { "epoch": 1.59, "learning_rate": 1.0542395131576244e-06, "logits/chosen": -1.9952224493026733, "logits/rejected": -2.0306599140167236, "logps/chosen": -42.52783203125, "logps/rejected": -107.02266693115234, "loss": 0.5835, "rewards/accuracies": 1.0, "rewards/chosen": 4.504494667053223, "rewards/margins": 0.18815326690673828, "rewards/rejected": 4.316341400146484, "step": 7191 }, { "epoch": 1.59, "learning_rate": 1.0531389226863926e-06, "logits/chosen": -2.1388819217681885, "logits/rejected": -2.1507558822631836, "logps/chosen": -40.963294982910156, "logps/rejected": -57.118675231933594, "loss": 0.518, "rewards/accuracies": 0.0, "rewards/chosen": 4.5370917320251465, "rewards/margins": -0.5747528076171875, "rewards/rejected": 5.111844539642334, "step": 7192 }, { "epoch": 1.59, "learning_rate": 1.0520388393773212e-06, "logits/chosen": -1.969917893409729, "logits/rejected": -1.9356937408447266, "logps/chosen": -53.689666748046875, "logps/rejected": -57.50629425048828, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": 4.572409152984619, "rewards/margins": 2.0676658153533936, "rewards/rejected": 2.5047433376312256, "step": 7193 }, { "epoch": 1.59, "learning_rate": 1.0509392633717685e-06, "logits/chosen": -1.8253930807113647, "logits/rejected": -1.7172890901565552, "logps/chosen": -122.92796325683594, "logps/rejected": -68.8792495727539, "loss": 0.0736, "rewards/accuracies": 1.0, "rewards/chosen": 5.374899387359619, "rewards/margins": 2.2506539821624756, "rewards/rejected": 3.1242454051971436, "step": 7194 }, { "epoch": 1.59, "learning_rate": 1.0498401948110264e-06, "logits/chosen": -1.8179820775985718, "logits/rejected": -1.8207677602767944, "logps/chosen": -36.33213806152344, "logps/rejected": -45.06047058105469, "loss": 0.7316, "rewards/accuracies": 1.0, "rewards/chosen": 3.213062286376953, "rewards/margins": 0.20638346672058105, "rewards/rejected": 3.006678819656372, "step": 7195 }, { "epoch": 1.59, "learning_rate": 1.0487416338363243e-06, "logits/chosen": -1.5515388250350952, "logits/rejected": -1.4773362874984741, "logps/chosen": -70.86724853515625, "logps/rejected": -59.114715576171875, "loss": 0.2894, "rewards/accuracies": 1.0, "rewards/chosen": 7.344333171844482, "rewards/margins": 5.02018928527832, "rewards/rejected": 2.324144124984741, "step": 7196 }, { "epoch": 1.59, "learning_rate": 1.0476435805888235e-06, "logits/chosen": -2.346083641052246, "logits/rejected": -2.3331470489501953, "logps/chosen": -116.1173095703125, "logps/rejected": -37.77080535888672, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 7.4847612380981445, "rewards/margins": 6.992588996887207, "rewards/rejected": 0.4921722412109375, "step": 7197 }, { "epoch": 1.59, "learning_rate": 1.0465460352096213e-06, "logits/chosen": -1.6810909509658813, "logits/rejected": -1.675959587097168, "logps/chosen": -32.35044479370117, "logps/rejected": -49.747074127197266, "loss": 0.4375, "rewards/accuracies": 0.0, "rewards/chosen": 4.26872444152832, "rewards/margins": -0.2899956703186035, "rewards/rejected": 4.558720111846924, "step": 7198 }, { "epoch": 1.59, "learning_rate": 1.0454489978397493e-06, "logits/chosen": -2.1758666038513184, "logits/rejected": -2.1613237857818604, "logps/chosen": -131.1865692138672, "logps/rejected": -61.659751892089844, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 8.609550476074219, "rewards/margins": 5.349018096923828, "rewards/rejected": 3.2605323791503906, "step": 7199 }, { "epoch": 1.59, "learning_rate": 1.0443524686201762e-06, "logits/chosen": -1.6394602060317993, "logits/rejected": -1.6255335807800293, "logps/chosen": -35.007843017578125, "logps/rejected": -38.34433364868164, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 3.254887342453003, "rewards/margins": 0.24825477600097656, "rewards/rejected": 3.0066325664520264, "step": 7200 }, { "epoch": 1.59, "learning_rate": 1.0432564476917995e-06, "logits/chosen": -1.9266208410263062, "logits/rejected": -1.8445042371749878, "logps/chosen": -62.505348205566406, "logps/rejected": -56.964508056640625, "loss": 0.1547, "rewards/accuracies": 1.0, "rewards/chosen": 5.003829479217529, "rewards/margins": 1.0917353630065918, "rewards/rejected": 3.9120941162109375, "step": 7201 }, { "epoch": 1.59, "learning_rate": 1.0421609351954599e-06, "logits/chosen": -1.874618411064148, "logits/rejected": -1.8204725980758667, "logps/chosen": -109.34822082519531, "logps/rejected": -145.05154418945312, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 12.809605598449707, "rewards/margins": 4.837745666503906, "rewards/rejected": 7.971859931945801, "step": 7202 }, { "epoch": 1.59, "learning_rate": 1.041065931271924e-06, "logits/chosen": -1.4966026544570923, "logits/rejected": -1.48312509059906, "logps/chosen": -10.730670928955078, "logps/rejected": -5.697494983673096, "loss": 0.1813, "rewards/accuracies": 1.0, "rewards/chosen": 1.4270941019058228, "rewards/margins": 0.933610200881958, "rewards/rejected": 0.49348387122154236, "step": 7203 }, { "epoch": 1.59, "learning_rate": 1.0399714360618996e-06, "logits/chosen": -1.9507641792297363, "logits/rejected": -1.8602019548416138, "logps/chosen": -123.86470031738281, "logps/rejected": -82.69873046875, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 6.663066387176514, "rewards/margins": 3.368605375289917, "rewards/rejected": 3.2944610118865967, "step": 7204 }, { "epoch": 1.59, "learning_rate": 1.0388774497060256e-06, "logits/chosen": -2.1256392002105713, "logits/rejected": -2.0882647037506104, "logps/chosen": -105.9132080078125, "logps/rejected": -76.26653289794922, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 7.155053615570068, "rewards/margins": 0.6360797882080078, "rewards/rejected": 6.5189738273620605, "step": 7205 }, { "epoch": 1.59, "learning_rate": 1.0377839723448774e-06, "logits/chosen": -1.9066269397735596, "logits/rejected": -1.9634747505187988, "logps/chosen": -80.86288452148438, "logps/rejected": -103.13523864746094, "loss": 0.6277, "rewards/accuracies": 0.0, "rewards/chosen": 6.29390287399292, "rewards/margins": -0.9033126831054688, "rewards/rejected": 7.197215557098389, "step": 7206 }, { "epoch": 1.6, "learning_rate": 1.0366910041189649e-06, "logits/chosen": -2.041670799255371, "logits/rejected": -1.9974342584609985, "logps/chosen": -91.98338317871094, "logps/rejected": -85.78361511230469, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": 8.476300239562988, "rewards/margins": 2.6622376441955566, "rewards/rejected": 5.814062595367432, "step": 7207 }, { "epoch": 1.6, "learning_rate": 1.035598545168728e-06, "logits/chosen": -2.0984046459198, "logits/rejected": -2.041360855102539, "logps/chosen": -89.07637023925781, "logps/rejected": -47.01458740234375, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 6.092894077301025, "rewards/margins": 2.4753243923187256, "rewards/rejected": 3.6175696849823, "step": 7208 }, { "epoch": 1.6, "learning_rate": 1.0345065956345506e-06, "logits/chosen": -1.8558422327041626, "logits/rejected": -1.8300154209136963, "logps/chosen": -116.79812622070312, "logps/rejected": -93.70114135742188, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 8.303421020507812, "rewards/margins": 3.7376279830932617, "rewards/rejected": 4.565793037414551, "step": 7209 }, { "epoch": 1.6, "learning_rate": 1.0334151556567406e-06, "logits/chosen": -2.3052332401275635, "logits/rejected": -2.270862340927124, "logps/chosen": -55.19130325317383, "logps/rejected": -9.875632286071777, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": 2.4098637104034424, "rewards/margins": 1.778187870979309, "rewards/rejected": 0.6316758394241333, "step": 7210 }, { "epoch": 1.6, "learning_rate": 1.03232422537555e-06, "logits/chosen": -2.144237995147705, "logits/rejected": -2.077854871749878, "logps/chosen": -103.00859069824219, "logps/rejected": -70.24989318847656, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": 8.26655101776123, "rewards/margins": 5.311164855957031, "rewards/rejected": 2.9553864002227783, "step": 7211 }, { "epoch": 1.6, "learning_rate": 1.0312338049311577e-06, "logits/chosen": -1.7719043493270874, "logits/rejected": -1.773216962814331, "logps/chosen": -65.87069702148438, "logps/rejected": -62.71529769897461, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 4.873446941375732, "rewards/margins": 1.6663451194763184, "rewards/rejected": 3.207101821899414, "step": 7212 }, { "epoch": 1.6, "learning_rate": 1.0301438944636799e-06, "logits/chosen": -1.799905776977539, "logits/rejected": -1.799905776977539, "logps/chosen": -17.483213424682617, "logps/rejected": -17.483213424682617, "loss": 1.0147, "rewards/accuracies": 0.0, "rewards/chosen": 3.2051186561584473, "rewards/margins": 0.0, "rewards/rejected": 3.2051186561584473, "step": 7213 }, { "epoch": 1.6, "learning_rate": 1.0290544941131692e-06, "logits/chosen": -1.8396096229553223, "logits/rejected": -1.7145376205444336, "logps/chosen": -42.15167236328125, "logps/rejected": -33.70998764038086, "loss": 0.256, "rewards/accuracies": 1.0, "rewards/chosen": 2.5201218128204346, "rewards/margins": 0.764054536819458, "rewards/rejected": 1.7560672760009766, "step": 7214 }, { "epoch": 1.6, "learning_rate": 1.0279656040196096e-06, "logits/chosen": -1.9805779457092285, "logits/rejected": -1.8279032707214355, "logps/chosen": -56.796775817871094, "logps/rejected": -19.535442352294922, "loss": 0.055, "rewards/accuracies": 1.0, "rewards/chosen": 5.034554958343506, "rewards/margins": 4.136646747589111, "rewards/rejected": 0.8979080319404602, "step": 7215 }, { "epoch": 1.6, "learning_rate": 1.026877224322923e-06, "logits/chosen": -1.776050329208374, "logits/rejected": -1.8359999656677246, "logps/chosen": -28.798782348632812, "logps/rejected": -88.82662200927734, "loss": 0.2717, "rewards/accuracies": 1.0, "rewards/chosen": 3.4926018714904785, "rewards/margins": 0.44544076919555664, "rewards/rejected": 3.047161102294922, "step": 7216 }, { "epoch": 1.6, "learning_rate": 1.0257893551629588e-06, "logits/chosen": -1.7556394338607788, "logits/rejected": -1.7803382873535156, "logps/chosen": -173.37815856933594, "logps/rejected": -150.51539611816406, "loss": 0.0616, "rewards/accuracies": 1.0, "rewards/chosen": 10.520247459411621, "rewards/margins": 2.323500633239746, "rewards/rejected": 8.196746826171875, "step": 7217 }, { "epoch": 1.6, "learning_rate": 1.0247019966795118e-06, "logits/chosen": -1.7955244779586792, "logits/rejected": -1.7472606897354126, "logps/chosen": -40.752994537353516, "logps/rejected": -116.04383850097656, "loss": 0.0884, "rewards/accuracies": 1.0, "rewards/chosen": 9.429471015930176, "rewards/margins": 1.9664230346679688, "rewards/rejected": 7.463047981262207, "step": 7218 }, { "epoch": 1.6, "learning_rate": 1.0236151490122992e-06, "logits/chosen": -1.845689058303833, "logits/rejected": -1.7465327978134155, "logps/chosen": -57.48129653930664, "logps/rejected": -18.4674015045166, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 4.212657451629639, "rewards/margins": 2.304560899734497, "rewards/rejected": 1.9080965518951416, "step": 7219 }, { "epoch": 1.6, "learning_rate": 1.0225288123009835e-06, "logits/chosen": -2.431975841522217, "logits/rejected": -2.44736647605896, "logps/chosen": -42.237117767333984, "logps/rejected": -71.8880615234375, "loss": 0.2617, "rewards/accuracies": 1.0, "rewards/chosen": 4.28471040725708, "rewards/margins": 0.5300023555755615, "rewards/rejected": 3.7547080516815186, "step": 7220 }, { "epoch": 1.6, "learning_rate": 1.0214429866851511e-06, "logits/chosen": -1.8524219989776611, "logits/rejected": -1.8908307552337646, "logps/chosen": -68.36981201171875, "logps/rejected": -92.76443481445312, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": 7.616374492645264, "rewards/margins": 0.7208847999572754, "rewards/rejected": 6.895489692687988, "step": 7221 }, { "epoch": 1.6, "learning_rate": 1.020357672304334e-06, "logits/chosen": -2.068509817123413, "logits/rejected": -2.043668508529663, "logps/chosen": -50.18804931640625, "logps/rejected": -41.62709045410156, "loss": 0.1707, "rewards/accuracies": 1.0, "rewards/chosen": 3.1942596435546875, "rewards/margins": 0.9226303100585938, "rewards/rejected": 2.2716293334960938, "step": 7222 }, { "epoch": 1.6, "learning_rate": 1.019272869297987e-06, "logits/chosen": -1.8891675472259521, "logits/rejected": -1.8891675472259521, "logps/chosen": -16.27246856689453, "logps/rejected": -16.27246856689453, "loss": 0.3484, "rewards/accuracies": 0.0, "rewards/chosen": 1.977976679801941, "rewards/margins": 0.0, "rewards/rejected": 1.977976679801941, "step": 7223 }, { "epoch": 1.6, "learning_rate": 1.0181885778055074e-06, "logits/chosen": -2.0054728984832764, "logits/rejected": -1.9219452142715454, "logps/chosen": -49.668121337890625, "logps/rejected": -17.47696876525879, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 2.78985595703125, "rewards/margins": 2.405778408050537, "rewards/rejected": 0.38407763838768005, "step": 7224 }, { "epoch": 1.6, "learning_rate": 1.0171047979662252e-06, "logits/chosen": -1.5596169233322144, "logits/rejected": -1.5507525205612183, "logps/chosen": -43.541908264160156, "logps/rejected": -57.06353759765625, "loss": 1.3165, "rewards/accuracies": 0.0, "rewards/chosen": 2.823777914047241, "rewards/margins": -2.44523024559021, "rewards/rejected": 5.269008159637451, "step": 7225 }, { "epoch": 1.6, "learning_rate": 1.0160215299193987e-06, "logits/chosen": -2.287365436553955, "logits/rejected": -2.2612295150756836, "logps/chosen": -45.600494384765625, "logps/rejected": -71.29833984375, "loss": 0.3503, "rewards/accuracies": 1.0, "rewards/chosen": 4.738173961639404, "rewards/margins": 0.25078296661376953, "rewards/rejected": 4.487390995025635, "step": 7226 }, { "epoch": 1.6, "learning_rate": 1.014938773804231e-06, "logits/chosen": -1.9512455463409424, "logits/rejected": -1.9512455463409424, "logps/chosen": -35.756507873535156, "logps/rejected": -35.756507873535156, "loss": 0.5145, "rewards/accuracies": 0.0, "rewards/chosen": 5.291896343231201, "rewards/margins": 0.0, "rewards/rejected": 5.291896343231201, "step": 7227 }, { "epoch": 1.6, "learning_rate": 1.0138565297598486e-06, "logits/chosen": -2.227522611618042, "logits/rejected": -2.214322566986084, "logps/chosen": -108.28350830078125, "logps/rejected": -58.43988037109375, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": 9.834327697753906, "rewards/margins": 5.980944633483887, "rewards/rejected": 3.8533828258514404, "step": 7228 }, { "epoch": 1.6, "learning_rate": 1.012774797925322e-06, "logits/chosen": -1.8646116256713867, "logits/rejected": -1.8646116256713867, "logps/chosen": -26.859375, "logps/rejected": -26.859375, "loss": 0.9042, "rewards/accuracies": 0.0, "rewards/chosen": 2.681810140609741, "rewards/margins": 0.0, "rewards/rejected": 2.681810140609741, "step": 7229 }, { "epoch": 1.6, "learning_rate": 1.0116935784396482e-06, "logits/chosen": -2.04882550239563, "logits/rejected": -2.04882550239563, "logps/chosen": -46.065364837646484, "logps/rejected": -46.065364837646484, "loss": 0.358, "rewards/accuracies": 0.0, "rewards/chosen": 6.243181228637695, "rewards/margins": 0.0, "rewards/rejected": 6.243181228637695, "step": 7230 }, { "epoch": 1.6, "learning_rate": 1.0106128714417613e-06, "logits/chosen": -2.2001099586486816, "logits/rejected": -2.100451707839966, "logps/chosen": -172.74380493164062, "logps/rejected": -20.516307830810547, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 9.567008018493652, "rewards/margins": 8.920930862426758, "rewards/rejected": 0.646077573299408, "step": 7231 }, { "epoch": 1.6, "learning_rate": 1.0095326770705304e-06, "logits/chosen": -1.6171026229858398, "logits/rejected": -1.4516690969467163, "logps/chosen": -68.4968032836914, "logps/rejected": -24.65320587158203, "loss": 0.1528, "rewards/accuracies": 1.0, "rewards/chosen": 1.5311638116836548, "rewards/margins": 1.1055618524551392, "rewards/rejected": 0.4256019592285156, "step": 7232 }, { "epoch": 1.6, "learning_rate": 1.0084529954647577e-06, "logits/chosen": -1.8078845739364624, "logits/rejected": -1.8069411516189575, "logps/chosen": -49.07025146484375, "logps/rejected": -58.98028564453125, "loss": 0.3003, "rewards/accuracies": 1.0, "rewards/chosen": 3.9233062267303467, "rewards/margins": 0.4189422130584717, "rewards/rejected": 3.504364013671875, "step": 7233 }, { "epoch": 1.6, "learning_rate": 1.0073738267631811e-06, "logits/chosen": -1.8503748178482056, "logits/rejected": -1.8039301633834839, "logps/chosen": -62.1191291809082, "logps/rejected": -41.097320556640625, "loss": 0.3122, "rewards/accuracies": 1.0, "rewards/chosen": 2.88214373588562, "rewards/margins": 0.7468714714050293, "rewards/rejected": 2.135272264480591, "step": 7234 }, { "epoch": 1.6, "learning_rate": 1.006295171104467e-06, "logits/chosen": -2.052156686782837, "logits/rejected": -2.016744613647461, "logps/chosen": -89.85020446777344, "logps/rejected": -63.50469970703125, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 6.8562822341918945, "rewards/margins": 2.361659526824951, "rewards/rejected": 4.494622707366943, "step": 7235 }, { "epoch": 1.6, "learning_rate": 1.0052170286272256e-06, "logits/chosen": -1.9543206691741943, "logits/rejected": -1.9664461612701416, "logps/chosen": -28.673133850097656, "logps/rejected": -46.147972106933594, "loss": 0.8612, "rewards/accuracies": 0.0, "rewards/chosen": 4.027554988861084, "rewards/margins": -1.0565719604492188, "rewards/rejected": 5.084126949310303, "step": 7236 }, { "epoch": 1.6, "learning_rate": 1.00413939946999e-06, "logits/chosen": -1.7433578968048096, "logits/rejected": -1.802744746208191, "logps/chosen": -20.094005584716797, "logps/rejected": -66.63258361816406, "loss": 1.2211, "rewards/accuracies": 0.0, "rewards/chosen": 4.476630210876465, "rewards/margins": -2.350898265838623, "rewards/rejected": 6.827528476715088, "step": 7237 }, { "epoch": 1.6, "learning_rate": 1.0030622837712384e-06, "logits/chosen": -2.1247708797454834, "logits/rejected": -2.064654588699341, "logps/chosen": -160.17098999023438, "logps/rejected": -78.8128662109375, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 5.949324131011963, "rewards/margins": 2.5781610012054443, "rewards/rejected": 3.3711631298065186, "step": 7238 }, { "epoch": 1.6, "learning_rate": 1.0019856816693734e-06, "logits/chosen": -2.0911502838134766, "logits/rejected": -2.1084401607513428, "logps/chosen": -67.78670501708984, "logps/rejected": -108.70440673828125, "loss": 0.3175, "rewards/accuracies": 1.0, "rewards/chosen": 9.285505294799805, "rewards/margins": 0.5143547058105469, "rewards/rejected": 8.771150588989258, "step": 7239 }, { "epoch": 1.6, "learning_rate": 1.000909593302738e-06, "logits/chosen": -1.749500036239624, "logits/rejected": -1.7881678342819214, "logps/chosen": -29.837013244628906, "logps/rejected": -110.6434555053711, "loss": 1.3966, "rewards/accuracies": 0.0, "rewards/chosen": 6.7770280838012695, "rewards/margins": -2.711843490600586, "rewards/rejected": 9.488871574401855, "step": 7240 }, { "epoch": 1.6, "learning_rate": 9.998340188096063e-07, "logits/chosen": -2.048253059387207, "logits/rejected": -1.9719005823135376, "logps/chosen": -71.80183410644531, "logps/rejected": -81.4700927734375, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": 5.791446208953857, "rewards/margins": 1.1478214263916016, "rewards/rejected": 4.643624782562256, "step": 7241 }, { "epoch": 1.6, "learning_rate": 9.987589583281866e-07, "logits/chosen": -1.8390753269195557, "logits/rejected": -1.433543086051941, "logps/chosen": -89.89501190185547, "logps/rejected": -146.4854736328125, "loss": 0.0905, "rewards/accuracies": 1.0, "rewards/chosen": 13.864584922790527, "rewards/margins": 5.160330772399902, "rewards/rejected": 8.704254150390625, "step": 7242 }, { "epoch": 1.6, "learning_rate": 9.976844119966233e-07, "logits/chosen": -1.9896252155303955, "logits/rejected": -1.9784350395202637, "logps/chosen": -34.261775970458984, "logps/rejected": -43.208892822265625, "loss": 0.6857, "rewards/accuracies": 0.0, "rewards/chosen": 3.1979236602783203, "rewards/margins": -1.0017728805541992, "rewards/rejected": 4.1996965408325195, "step": 7243 }, { "epoch": 1.6, "learning_rate": 9.966103799529891e-07, "logits/chosen": -1.9706132411956787, "logits/rejected": -1.9706132411956787, "logps/chosen": -31.918977737426758, "logps/rejected": -31.918977737426758, "loss": 0.3472, "rewards/accuracies": 0.0, "rewards/chosen": 4.682647705078125, "rewards/margins": 0.0, "rewards/rejected": 4.682647705078125, "step": 7244 }, { "epoch": 1.6, "learning_rate": 9.955368623353e-07, "logits/chosen": -2.064345598220825, "logits/rejected": -2.0647518634796143, "logps/chosen": -48.89836883544922, "logps/rejected": -75.90272521972656, "loss": 0.4576, "rewards/accuracies": 0.0, "rewards/chosen": 4.04614782333374, "rewards/margins": -0.3972787857055664, "rewards/rejected": 4.443426609039307, "step": 7245 }, { "epoch": 1.6, "learning_rate": 9.944638592814954e-07, "logits/chosen": -1.7888877391815186, "logits/rejected": -1.8895726203918457, "logps/chosen": -55.343955993652344, "logps/rejected": -151.18679809570312, "loss": 0.5672, "rewards/accuracies": 0.0, "rewards/chosen": 8.623162269592285, "rewards/margins": -0.09177303314208984, "rewards/rejected": 8.714935302734375, "step": 7246 }, { "epoch": 1.6, "learning_rate": 9.933913709294579e-07, "logits/chosen": -1.8374276161193848, "logits/rejected": -1.3782671689987183, "logps/chosen": -35.819496154785156, "logps/rejected": -34.429100036621094, "loss": 0.546, "rewards/accuracies": 1.0, "rewards/chosen": 4.012367248535156, "rewards/margins": 0.5204260349273682, "rewards/rejected": 3.491941213607788, "step": 7247 }, { "epoch": 1.6, "learning_rate": 9.923193974169964e-07, "logits/chosen": -1.9421837329864502, "logits/rejected": -1.7591472864151, "logps/chosen": -70.75592041015625, "logps/rejected": -21.24789047241211, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 3.7380471229553223, "rewards/margins": 2.756361722946167, "rewards/rejected": 0.9816854596138, "step": 7248 }, { "epoch": 1.6, "learning_rate": 9.912479388818585e-07, "logits/chosen": -1.9375985860824585, "logits/rejected": -1.8164288997650146, "logps/chosen": -51.27983856201172, "logps/rejected": -29.82947540283203, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 3.0699410438537598, "rewards/margins": 2.9512321949005127, "rewards/rejected": 0.11870880424976349, "step": 7249 }, { "epoch": 1.6, "learning_rate": 9.901769954617235e-07, "logits/chosen": -1.731589674949646, "logits/rejected": -1.6727821826934814, "logps/chosen": -50.76133728027344, "logps/rejected": -32.40424346923828, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": 3.480703115463257, "rewards/margins": 0.03653883934020996, "rewards/rejected": 3.444164276123047, "step": 7250 }, { "epoch": 1.6, "learning_rate": 9.891065672942052e-07, "logits/chosen": -1.9135560989379883, "logits/rejected": -1.59689199924469, "logps/chosen": -85.5625, "logps/rejected": -40.14224624633789, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.6101837158203125, "rewards/margins": 7.983448505401611, "rewards/rejected": -0.37326470017433167, "step": 7251 }, { "epoch": 1.61, "learning_rate": 9.880366545168507e-07, "logits/chosen": -2.0777716636657715, "logits/rejected": -2.0777716636657715, "logps/chosen": -28.882658004760742, "logps/rejected": -28.882658004760742, "loss": 0.3793, "rewards/accuracies": 0.0, "rewards/chosen": 3.001721143722534, "rewards/margins": 0.0, "rewards/rejected": 3.001721143722534, "step": 7252 }, { "epoch": 1.61, "learning_rate": 9.869672572671419e-07, "logits/chosen": -1.8147639036178589, "logits/rejected": -1.6703935861587524, "logps/chosen": -46.88816452026367, "logps/rejected": -30.218997955322266, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 3.9159481525421143, "rewards/margins": 2.052499294281006, "rewards/rejected": 1.8634487390518188, "step": 7253 }, { "epoch": 1.61, "learning_rate": 9.858983756824952e-07, "logits/chosen": -1.9727340936660767, "logits/rejected": -1.917001724243164, "logps/chosen": -38.374839782714844, "logps/rejected": -76.49092864990234, "loss": 0.1769, "rewards/accuracies": 1.0, "rewards/chosen": 3.4049134254455566, "rewards/margins": 1.5630470514297485, "rewards/rejected": 1.841866374015808, "step": 7254 }, { "epoch": 1.61, "learning_rate": 9.848300099002546e-07, "logits/chosen": -1.6536579132080078, "logits/rejected": -1.5851958990097046, "logps/chosen": -44.53382110595703, "logps/rejected": -33.25562286376953, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": 4.160715579986572, "rewards/margins": 1.2096984386444092, "rewards/rejected": 2.951017141342163, "step": 7255 }, { "epoch": 1.61, "learning_rate": 9.83762160057708e-07, "logits/chosen": -2.051520824432373, "logits/rejected": -2.0657832622528076, "logps/chosen": -43.58325958251953, "logps/rejected": -101.45591735839844, "loss": 0.6434, "rewards/accuracies": 0.0, "rewards/chosen": 5.786017894744873, "rewards/margins": -0.9487781524658203, "rewards/rejected": 6.734796047210693, "step": 7256 }, { "epoch": 1.61, "learning_rate": 9.826948262920678e-07, "logits/chosen": -2.134446382522583, "logits/rejected": -2.1099002361297607, "logps/chosen": -54.00409698486328, "logps/rejected": -66.63450622558594, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 1.348401665687561, "rewards/margins": 0.47333985567092896, "rewards/rejected": 0.8750618100166321, "step": 7257 }, { "epoch": 1.61, "learning_rate": 9.816280087404851e-07, "logits/chosen": -2.0072269439697266, "logits/rejected": -1.967582106590271, "logps/chosen": -72.82327270507812, "logps/rejected": -149.52572631835938, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": 9.010614395141602, "rewards/margins": 1.962564468383789, "rewards/rejected": 7.0480499267578125, "step": 7258 }, { "epoch": 1.61, "learning_rate": 9.805617075400437e-07, "logits/chosen": -1.6003566980361938, "logits/rejected": -1.6512314081192017, "logps/chosen": -29.11214256286621, "logps/rejected": -75.9677734375, "loss": 1.0127, "rewards/accuracies": 0.0, "rewards/chosen": 2.515214681625366, "rewards/margins": -1.5246756076812744, "rewards/rejected": 4.039890289306641, "step": 7259 }, { "epoch": 1.61, "learning_rate": 9.794959228277601e-07, "logits/chosen": -1.9038264751434326, "logits/rejected": -1.8165823221206665, "logps/chosen": -89.15509796142578, "logps/rejected": -41.40468215942383, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 6.2255120277404785, "rewards/margins": 3.0565853118896484, "rewards/rejected": 3.16892671585083, "step": 7260 }, { "epoch": 1.61, "learning_rate": 9.784306547405853e-07, "logits/chosen": -1.7531172037124634, "logits/rejected": -1.7578610181808472, "logps/chosen": -30.704952239990234, "logps/rejected": -72.85476684570312, "loss": 0.2384, "rewards/accuracies": 1.0, "rewards/chosen": 3.6840062141418457, "rewards/margins": 0.7170307636260986, "rewards/rejected": 2.966975450515747, "step": 7261 }, { "epoch": 1.61, "learning_rate": 9.773659034154043e-07, "logits/chosen": -1.9096938371658325, "logits/rejected": -1.9067702293395996, "logps/chosen": -73.66706085205078, "logps/rejected": -69.56912231445312, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 6.852468013763428, "rewards/margins": 1.6722183227539062, "rewards/rejected": 5.1802496910095215, "step": 7262 }, { "epoch": 1.61, "learning_rate": 9.763016689890353e-07, "logits/chosen": -2.094484567642212, "logits/rejected": -2.1168148517608643, "logps/chosen": -63.51551055908203, "logps/rejected": -64.99571228027344, "loss": 1.0549, "rewards/accuracies": 0.0, "rewards/chosen": 4.756030559539795, "rewards/margins": -1.8117046356201172, "rewards/rejected": 6.567735195159912, "step": 7263 }, { "epoch": 1.61, "learning_rate": 9.752379515982269e-07, "logits/chosen": -1.9817062616348267, "logits/rejected": -1.9817062616348267, "logps/chosen": -64.14933776855469, "logps/rejected": -64.14933776855469, "loss": 0.5616, "rewards/accuracies": 0.0, "rewards/chosen": 3.469674825668335, "rewards/margins": 0.0, "rewards/rejected": 3.469674825668335, "step": 7264 }, { "epoch": 1.61, "learning_rate": 9.74174751379669e-07, "logits/chosen": -1.8937931060791016, "logits/rejected": -1.7548768520355225, "logps/chosen": -164.52374267578125, "logps/rejected": -42.50834274291992, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 7.618782043457031, "rewards/margins": 5.243595123291016, "rewards/rejected": 2.3751866817474365, "step": 7265 }, { "epoch": 1.61, "learning_rate": 9.731120684699762e-07, "logits/chosen": -1.737485647201538, "logits/rejected": -1.7012859582901, "logps/chosen": -34.79804229736328, "logps/rejected": -62.42353439331055, "loss": 0.7604, "rewards/accuracies": 1.0, "rewards/chosen": 5.169834136962891, "rewards/margins": 2.3920443058013916, "rewards/rejected": 2.777789831161499, "step": 7266 }, { "epoch": 1.61, "learning_rate": 9.720499030057023e-07, "logits/chosen": -1.9843411445617676, "logits/rejected": -1.8581745624542236, "logps/chosen": -39.08586502075195, "logps/rejected": -46.81840896606445, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 2.4839794635772705, "rewards/margins": 2.0077831745147705, "rewards/rejected": 0.4761962890625, "step": 7267 }, { "epoch": 1.61, "learning_rate": 9.70988255123333e-07, "logits/chosen": -1.8605579137802124, "logits/rejected": -1.7422140836715698, "logps/chosen": -53.15459442138672, "logps/rejected": -57.59347152709961, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 3.59932017326355, "rewards/margins": 0.03606057167053223, "rewards/rejected": 3.5632596015930176, "step": 7268 }, { "epoch": 1.61, "learning_rate": 9.699271249592874e-07, "logits/chosen": -1.6770744323730469, "logits/rejected": -1.6232197284698486, "logps/chosen": -63.22237777709961, "logps/rejected": -39.83415222167969, "loss": 0.1943, "rewards/accuracies": 1.0, "rewards/chosen": 4.151988506317139, "rewards/margins": 1.1028940677642822, "rewards/rejected": 3.0490944385528564, "step": 7269 }, { "epoch": 1.61, "learning_rate": 9.688665126499187e-07, "logits/chosen": -2.243340492248535, "logits/rejected": -2.2414631843566895, "logps/chosen": -116.80976867675781, "logps/rejected": -76.6873779296875, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 12.137641906738281, "rewards/margins": 4.452998161315918, "rewards/rejected": 7.684643745422363, "step": 7270 }, { "epoch": 1.61, "learning_rate": 9.678064183315116e-07, "logits/chosen": -2.0963377952575684, "logits/rejected": -2.078667640686035, "logps/chosen": -26.294803619384766, "logps/rejected": -50.67179870605469, "loss": 0.708, "rewards/accuracies": 0.0, "rewards/chosen": 3.8137104511260986, "rewards/margins": -1.125304937362671, "rewards/rejected": 4.9390153884887695, "step": 7271 }, { "epoch": 1.61, "learning_rate": 9.66746842140287e-07, "logits/chosen": -1.9070522785186768, "logits/rejected": -1.868423342704773, "logps/chosen": -48.541412353515625, "logps/rejected": -36.09591293334961, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": 4.85528564453125, "rewards/margins": 1.801443099975586, "rewards/rejected": 3.053842544555664, "step": 7272 }, { "epoch": 1.61, "learning_rate": 9.65687784212398e-07, "logits/chosen": -2.127925395965576, "logits/rejected": -2.052379846572876, "logps/chosen": -57.4351692199707, "logps/rejected": -45.428466796875, "loss": 0.8177, "rewards/accuracies": 1.0, "rewards/chosen": 4.056171894073486, "rewards/margins": 1.3654887676239014, "rewards/rejected": 2.690683126449585, "step": 7273 }, { "epoch": 1.61, "learning_rate": 9.64629244683931e-07, "logits/chosen": -1.990654468536377, "logits/rejected": -1.4992518424987793, "logps/chosen": -100.62623596191406, "logps/rejected": -46.00379180908203, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 6.786403179168701, "rewards/margins": 3.784240961074829, "rewards/rejected": 3.002162218093872, "step": 7274 }, { "epoch": 1.61, "learning_rate": 9.63571223690905e-07, "logits/chosen": -2.0770914554595947, "logits/rejected": -2.0294108390808105, "logps/chosen": -70.40187072753906, "logps/rejected": -29.28118896484375, "loss": 0.4397, "rewards/accuracies": 1.0, "rewards/chosen": 4.203532695770264, "rewards/margins": 1.5855364799499512, "rewards/rejected": 2.6179962158203125, "step": 7275 }, { "epoch": 1.61, "learning_rate": 9.625137213692725e-07, "logits/chosen": -1.6670724153518677, "logits/rejected": -1.5454835891723633, "logps/chosen": -99.96725463867188, "logps/rejected": -63.774314880371094, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 4.94697904586792, "rewards/margins": 2.298092842102051, "rewards/rejected": 2.648886203765869, "step": 7276 }, { "epoch": 1.61, "learning_rate": 9.614567378549217e-07, "logits/chosen": -2.0180087089538574, "logits/rejected": -1.9834661483764648, "logps/chosen": -53.784175872802734, "logps/rejected": -56.10496139526367, "loss": 0.889, "rewards/accuracies": 1.0, "rewards/chosen": 3.1950747966766357, "rewards/margins": 1.9188803434371948, "rewards/rejected": 1.276194453239441, "step": 7277 }, { "epoch": 1.61, "learning_rate": 9.604002732836715e-07, "logits/chosen": -1.7655678987503052, "logits/rejected": -1.6751590967178345, "logps/chosen": -56.08879852294922, "logps/rejected": -41.977622985839844, "loss": 0.2783, "rewards/accuracies": 1.0, "rewards/chosen": 6.217648983001709, "rewards/margins": 0.4287528991699219, "rewards/rejected": 5.788896083831787, "step": 7278 }, { "epoch": 1.61, "learning_rate": 9.593443277912751e-07, "logits/chosen": -1.6177680492401123, "logits/rejected": -1.6177680492401123, "logps/chosen": -11.174534797668457, "logps/rejected": -11.174534797668457, "loss": 0.3821, "rewards/accuracies": 0.0, "rewards/chosen": 1.2579238414764404, "rewards/margins": 0.0, "rewards/rejected": 1.2579238414764404, "step": 7279 }, { "epoch": 1.61, "learning_rate": 9.58288901513419e-07, "logits/chosen": -1.8554893732070923, "logits/rejected": -1.8572665452957153, "logps/chosen": -42.716552734375, "logps/rejected": -50.031280517578125, "loss": 0.3805, "rewards/accuracies": 1.0, "rewards/chosen": 3.8796546459198, "rewards/margins": 0.46690893173217773, "rewards/rejected": 3.412745714187622, "step": 7280 }, { "epoch": 1.61, "learning_rate": 9.572339945857228e-07, "logits/chosen": -2.1937341690063477, "logits/rejected": -2.092348337173462, "logps/chosen": -53.32899475097656, "logps/rejected": -25.129749298095703, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 3.8575172424316406, "rewards/margins": 2.842188835144043, "rewards/rejected": 1.015328288078308, "step": 7281 }, { "epoch": 1.61, "learning_rate": 9.561796071437418e-07, "logits/chosen": -2.0765836238861084, "logits/rejected": -2.0167675018310547, "logps/chosen": -121.0275650024414, "logps/rejected": -78.97225952148438, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": 8.139354705810547, "rewards/margins": 2.6229820251464844, "rewards/rejected": 5.5163726806640625, "step": 7282 }, { "epoch": 1.61, "learning_rate": 9.551257393229568e-07, "logits/chosen": -1.960025668144226, "logits/rejected": -1.9666906595230103, "logps/chosen": -33.61183547973633, "logps/rejected": -42.92164611816406, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 4.325056076049805, "rewards/margins": 1.619370698928833, "rewards/rejected": 2.7056853771209717, "step": 7283 }, { "epoch": 1.61, "learning_rate": 9.54072391258794e-07, "logits/chosen": -1.9570387601852417, "logits/rejected": -1.909498929977417, "logps/chosen": -72.3849868774414, "logps/rejected": -38.75351333618164, "loss": 0.1866, "rewards/accuracies": 1.0, "rewards/chosen": 3.4535775184631348, "rewards/margins": 0.8156535625457764, "rewards/rejected": 2.6379239559173584, "step": 7284 }, { "epoch": 1.61, "learning_rate": 9.530195630866012e-07, "logits/chosen": -2.1800386905670166, "logits/rejected": -2.1657981872558594, "logps/chosen": -55.5821533203125, "logps/rejected": -82.64461517333984, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": 5.6826090812683105, "rewards/margins": 1.2107195854187012, "rewards/rejected": 4.471889495849609, "step": 7285 }, { "epoch": 1.61, "learning_rate": 9.519672549416659e-07, "logits/chosen": -1.5272003412246704, "logits/rejected": -1.5372390747070312, "logps/chosen": -48.26919174194336, "logps/rejected": -49.50001525878906, "loss": 0.5201, "rewards/accuracies": 1.0, "rewards/chosen": 6.788414001464844, "rewards/margins": 0.2748856544494629, "rewards/rejected": 6.513528347015381, "step": 7286 }, { "epoch": 1.61, "learning_rate": 9.509154669592069e-07, "logits/chosen": -1.9924677610397339, "logits/rejected": -1.9924677610397339, "logps/chosen": -20.91608238220215, "logps/rejected": -20.91608238220215, "loss": 0.3552, "rewards/accuracies": 0.0, "rewards/chosen": 5.2488837242126465, "rewards/margins": 0.0, "rewards/rejected": 5.2488837242126465, "step": 7287 }, { "epoch": 1.61, "learning_rate": 9.498641992743773e-07, "logits/chosen": -2.0851855278015137, "logits/rejected": -2.017385244369507, "logps/chosen": -100.12959289550781, "logps/rejected": -50.06669616699219, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 5.482801914215088, "rewards/margins": 2.2359306812286377, "rewards/rejected": 3.24687123298645, "step": 7288 }, { "epoch": 1.61, "learning_rate": 9.488134520222614e-07, "logits/chosen": -1.9542025327682495, "logits/rejected": -1.9506930112838745, "logps/chosen": -23.888690948486328, "logps/rejected": -31.86454963684082, "loss": 0.9637, "rewards/accuracies": 1.0, "rewards/chosen": 3.5947329998016357, "rewards/margins": 0.542119026184082, "rewards/rejected": 3.0526139736175537, "step": 7289 }, { "epoch": 1.61, "learning_rate": 9.477632253378777e-07, "logits/chosen": -1.8686808347702026, "logits/rejected": -1.8175294399261475, "logps/chosen": -27.222515106201172, "logps/rejected": -84.7977294921875, "loss": 0.3405, "rewards/accuracies": 1.0, "rewards/chosen": 5.12245512008667, "rewards/margins": 2.7282609939575195, "rewards/rejected": 2.3941941261291504, "step": 7290 }, { "epoch": 1.61, "learning_rate": 9.4671351935618e-07, "logits/chosen": -1.9438927173614502, "logits/rejected": -1.9438927173614502, "logps/chosen": -77.24046325683594, "logps/rejected": -77.24046325683594, "loss": 0.348, "rewards/accuracies": 0.0, "rewards/chosen": 10.855025291442871, "rewards/margins": 0.0, "rewards/rejected": 10.855025291442871, "step": 7291 }, { "epoch": 1.61, "learning_rate": 9.456643342120486e-07, "logits/chosen": -1.9493625164031982, "logits/rejected": -1.9588385820388794, "logps/chosen": -104.30143737792969, "logps/rejected": -177.62472534179688, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 12.929222106933594, "rewards/margins": 4.086677551269531, "rewards/rejected": 8.842544555664062, "step": 7292 }, { "epoch": 1.61, "learning_rate": 9.446156700403069e-07, "logits/chosen": -1.987149953842163, "logits/rejected": -1.9252617359161377, "logps/chosen": -80.1071548461914, "logps/rejected": -59.4343147277832, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 8.419637680053711, "rewards/margins": 4.095269680023193, "rewards/rejected": 4.324368000030518, "step": 7293 }, { "epoch": 1.61, "learning_rate": 9.435675269757011e-07, "logits/chosen": -1.7455159425735474, "logits/rejected": -1.7887189388275146, "logps/chosen": -21.739238739013672, "logps/rejected": -69.36753845214844, "loss": 1.1051, "rewards/accuracies": 0.0, "rewards/chosen": 4.057600021362305, "rewards/margins": -2.067955493927002, "rewards/rejected": 6.125555515289307, "step": 7294 }, { "epoch": 1.61, "learning_rate": 9.425199051529166e-07, "logits/chosen": -1.8296120166778564, "logits/rejected": -1.7898112535476685, "logps/chosen": -72.98509216308594, "logps/rejected": -61.584781646728516, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": 7.3236894607543945, "rewards/margins": 3.6886205673217773, "rewards/rejected": 3.635068893432617, "step": 7295 }, { "epoch": 1.61, "learning_rate": 9.414728047065702e-07, "logits/chosen": -1.881496548652649, "logits/rejected": -1.8834478855133057, "logps/chosen": -86.81350708007812, "logps/rejected": -48.667503356933594, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 5.040188789367676, "rewards/margins": 2.1101059913635254, "rewards/rejected": 2.9300827980041504, "step": 7296 }, { "epoch": 1.62, "learning_rate": 9.404262257712121e-07, "logits/chosen": -2.0023868083953857, "logits/rejected": -1.975728988647461, "logps/chosen": -64.3027114868164, "logps/rejected": -37.343536376953125, "loss": 0.1386, "rewards/accuracies": 1.0, "rewards/chosen": 4.0844316482543945, "rewards/margins": 1.3729326725006104, "rewards/rejected": 2.711498975753784, "step": 7297 }, { "epoch": 1.62, "learning_rate": 9.393801684813253e-07, "logits/chosen": -1.8574037551879883, "logits/rejected": -1.677253007888794, "logps/chosen": -201.08717346191406, "logps/rejected": -62.58653259277344, "loss": 0.2088, "rewards/accuracies": 1.0, "rewards/chosen": 9.254557609558105, "rewards/margins": 4.584844589233398, "rewards/rejected": 4.669713020324707, "step": 7298 }, { "epoch": 1.62, "learning_rate": 9.38334632971325e-07, "logits/chosen": -2.0428075790405273, "logits/rejected": -2.0876657962799072, "logps/chosen": -102.98455810546875, "logps/rejected": -120.03048706054688, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 9.294024467468262, "rewards/margins": 1.324697494506836, "rewards/rejected": 7.969326972961426, "step": 7299 }, { "epoch": 1.62, "learning_rate": 9.372896193755621e-07, "logits/chosen": -2.092926263809204, "logits/rejected": -2.037945508956909, "logps/chosen": -37.53739547729492, "logps/rejected": -39.80208969116211, "loss": 0.1918, "rewards/accuracies": 1.0, "rewards/chosen": 3.9884746074676514, "rewards/margins": 1.5329961776733398, "rewards/rejected": 2.4554784297943115, "step": 7300 }, { "epoch": 1.62, "learning_rate": 9.362451278283136e-07, "logits/chosen": -1.9033772945404053, "logits/rejected": -1.9490259885787964, "logps/chosen": -32.53234100341797, "logps/rejected": -113.4744873046875, "loss": 1.2298, "rewards/accuracies": 0.0, "rewards/chosen": 4.774659156799316, "rewards/margins": -2.3563575744628906, "rewards/rejected": 7.131016731262207, "step": 7301 }, { "epoch": 1.62, "learning_rate": 9.352011584637999e-07, "logits/chosen": -1.8954644203186035, "logits/rejected": -1.8503048419952393, "logps/chosen": -47.57069396972656, "logps/rejected": -4.556336402893066, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": 3.681170701980591, "rewards/margins": 1.7614208459854126, "rewards/rejected": 1.9197498559951782, "step": 7302 }, { "epoch": 1.62, "learning_rate": 9.341577114161637e-07, "logits/chosen": -2.0527210235595703, "logits/rejected": -1.9505774974822998, "logps/chosen": -58.315765380859375, "logps/rejected": -27.63392448425293, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": 3.0989837646484375, "rewards/margins": 2.1547505855560303, "rewards/rejected": 0.9442331194877625, "step": 7303 }, { "epoch": 1.62, "learning_rate": 9.331147868194895e-07, "logits/chosen": -2.2748701572418213, "logits/rejected": -2.2906792163848877, "logps/chosen": -51.203399658203125, "logps/rejected": -134.38565063476562, "loss": 0.8309, "rewards/accuracies": 0.0, "rewards/chosen": 5.217595100402832, "rewards/margins": -1.4499220848083496, "rewards/rejected": 6.667517185211182, "step": 7304 }, { "epoch": 1.62, "learning_rate": 9.320723848077878e-07, "logits/chosen": -1.4596571922302246, "logits/rejected": -1.4246524572372437, "logps/chosen": -38.01423645019531, "logps/rejected": -36.13905334472656, "loss": 0.0749, "rewards/accuracies": 1.0, "rewards/chosen": 5.218869209289551, "rewards/margins": 2.0604448318481445, "rewards/rejected": 3.1584243774414062, "step": 7305 }, { "epoch": 1.62, "learning_rate": 9.310305055150054e-07, "logits/chosen": -1.8156113624572754, "logits/rejected": -1.9283018112182617, "logps/chosen": -120.28231048583984, "logps/rejected": -226.4365692138672, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 14.033514976501465, "rewards/margins": 4.889842987060547, "rewards/rejected": 9.143671989440918, "step": 7306 }, { "epoch": 1.62, "learning_rate": 9.299891490750218e-07, "logits/chosen": -2.103057861328125, "logits/rejected": -2.1116061210632324, "logps/chosen": -69.85588836669922, "logps/rejected": -48.76544952392578, "loss": 0.1806, "rewards/accuracies": 1.0, "rewards/chosen": 5.838913917541504, "rewards/margins": 1.0840048789978027, "rewards/rejected": 4.754909038543701, "step": 7307 }, { "epoch": 1.62, "learning_rate": 9.289483156216483e-07, "logits/chosen": -1.9090644121170044, "logits/rejected": -1.846835732460022, "logps/chosen": -60.29510498046875, "logps/rejected": -52.866188049316406, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 6.865567207336426, "rewards/margins": 1.841498851776123, "rewards/rejected": 5.024068355560303, "step": 7308 }, { "epoch": 1.62, "learning_rate": 9.279080052886308e-07, "logits/chosen": -1.9019033908843994, "logits/rejected": -1.897482991218567, "logps/chosen": -60.451416015625, "logps/rejected": -46.05421447753906, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 2.607875108718872, "rewards/margins": 1.7029595375061035, "rewards/rejected": 0.9049156308174133, "step": 7309 }, { "epoch": 1.62, "learning_rate": 9.268682182096434e-07, "logits/chosen": -2.0211615562438965, "logits/rejected": -1.9565610885620117, "logps/chosen": -60.37165832519531, "logps/rejected": -10.701650619506836, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": 3.744464874267578, "rewards/margins": 2.98962140083313, "rewards/rejected": 0.754843533039093, "step": 7310 }, { "epoch": 1.62, "learning_rate": 9.258289545183008e-07, "logits/chosen": -1.9805651903152466, "logits/rejected": -1.9436185359954834, "logps/chosen": -123.27588653564453, "logps/rejected": -118.09508514404297, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 11.079044342041016, "rewards/margins": 1.5028820037841797, "rewards/rejected": 9.576162338256836, "step": 7311 }, { "epoch": 1.62, "learning_rate": 9.247902143481408e-07, "logits/chosen": -1.824224829673767, "logits/rejected": -1.7864362001419067, "logps/chosen": -47.36726760864258, "logps/rejected": -16.66399383544922, "loss": 0.4159, "rewards/accuracies": 1.0, "rewards/chosen": 6.170181751251221, "rewards/margins": 3.72497820854187, "rewards/rejected": 2.4452035427093506, "step": 7312 }, { "epoch": 1.62, "learning_rate": 9.237519978326443e-07, "logits/chosen": -1.8591876029968262, "logits/rejected": -1.7335174083709717, "logps/chosen": -57.20029067993164, "logps/rejected": -8.897157669067383, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 6.67819356918335, "rewards/margins": 5.816858768463135, "rewards/rejected": 0.8613349795341492, "step": 7313 }, { "epoch": 1.62, "learning_rate": 9.227143051052162e-07, "logits/chosen": -1.8580609560012817, "logits/rejected": -1.8843920230865479, "logps/chosen": -25.164539337158203, "logps/rejected": -126.8006820678711, "loss": 1.4659, "rewards/accuracies": 0.0, "rewards/chosen": 3.578207015991211, "rewards/margins": -2.125863552093506, "rewards/rejected": 5.704070568084717, "step": 7314 }, { "epoch": 1.62, "learning_rate": 9.216771362991977e-07, "logits/chosen": -1.6702346801757812, "logits/rejected": -1.5823795795440674, "logps/chosen": -38.15287780761719, "logps/rejected": -24.6251163482666, "loss": 0.2463, "rewards/accuracies": 1.0, "rewards/chosen": 3.4386003017425537, "rewards/margins": 2.6187992095947266, "rewards/rejected": 0.8198011517524719, "step": 7315 }, { "epoch": 1.62, "learning_rate": 9.206404915478639e-07, "logits/chosen": -2.0367801189422607, "logits/rejected": -1.9329512119293213, "logps/chosen": -45.946693420410156, "logps/rejected": -22.90240478515625, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 3.836174726486206, "rewards/margins": 3.7953546047210693, "rewards/rejected": 0.04082012176513672, "step": 7316 }, { "epoch": 1.62, "learning_rate": 9.196043709844199e-07, "logits/chosen": -1.6732783317565918, "logits/rejected": -1.6655536890029907, "logps/chosen": -39.062255859375, "logps/rejected": -34.68159484863281, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 4.688875675201416, "rewards/margins": 0.19586801528930664, "rewards/rejected": 4.493007659912109, "step": 7317 }, { "epoch": 1.62, "learning_rate": 9.185687747420069e-07, "logits/chosen": -2.034477710723877, "logits/rejected": -1.9738751649856567, "logps/chosen": -63.750205993652344, "logps/rejected": -30.76696014404297, "loss": 0.3363, "rewards/accuracies": 1.0, "rewards/chosen": 3.8811707496643066, "rewards/margins": 0.9136173725128174, "rewards/rejected": 2.9675533771514893, "step": 7318 }, { "epoch": 1.62, "learning_rate": 9.175337029536918e-07, "logits/chosen": -2.0126900672912598, "logits/rejected": -2.0462584495544434, "logps/chosen": -20.077693939208984, "logps/rejected": -83.55874633789062, "loss": 1.1826, "rewards/accuracies": 0.0, "rewards/chosen": 3.036674976348877, "rewards/margins": -2.0059046745300293, "rewards/rejected": 5.042579650878906, "step": 7319 }, { "epoch": 1.62, "learning_rate": 9.164991557524849e-07, "logits/chosen": -1.753822684288025, "logits/rejected": -1.6848945617675781, "logps/chosen": -85.39021301269531, "logps/rejected": -44.73844909667969, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 5.698176860809326, "rewards/margins": 3.390197992324829, "rewards/rejected": 2.307978868484497, "step": 7320 }, { "epoch": 1.62, "learning_rate": 9.154651332713172e-07, "logits/chosen": -2.137545108795166, "logits/rejected": -2.1121397018432617, "logps/chosen": -42.625511169433594, "logps/rejected": -35.16947937011719, "loss": 0.2657, "rewards/accuracies": 1.0, "rewards/chosen": 2.994396924972534, "rewards/margins": 0.3630180358886719, "rewards/rejected": 2.6313788890838623, "step": 7321 }, { "epoch": 1.62, "learning_rate": 9.144316356430632e-07, "logits/chosen": -1.809128999710083, "logits/rejected": -1.7376258373260498, "logps/chosen": -40.9465217590332, "logps/rejected": -48.72337341308594, "loss": 0.7193, "rewards/accuracies": 0.0, "rewards/chosen": 4.190650463104248, "rewards/margins": -0.1020956039428711, "rewards/rejected": 4.292746067047119, "step": 7322 }, { "epoch": 1.62, "learning_rate": 9.13398663000522e-07, "logits/chosen": -1.9814587831497192, "logits/rejected": -1.9389878511428833, "logps/chosen": -51.302330017089844, "logps/rejected": -30.52414894104004, "loss": 0.262, "rewards/accuracies": 1.0, "rewards/chosen": 3.5307579040527344, "rewards/margins": 0.6597681045532227, "rewards/rejected": 2.8709897994995117, "step": 7323 }, { "epoch": 1.62, "learning_rate": 9.123662154764285e-07, "logits/chosen": -1.9208203554153442, "logits/rejected": -1.840477705001831, "logps/chosen": -28.254623413085938, "logps/rejected": -26.41222381591797, "loss": 0.2659, "rewards/accuracies": 1.0, "rewards/chosen": 3.1230437755584717, "rewards/margins": 1.6235594749450684, "rewards/rejected": 1.4994843006134033, "step": 7324 }, { "epoch": 1.62, "learning_rate": 9.113342932034496e-07, "logits/chosen": -2.191678762435913, "logits/rejected": -2.163724422454834, "logps/chosen": -109.83171844482422, "logps/rejected": -184.3563690185547, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 13.075332641601562, "rewards/margins": 3.466019630432129, "rewards/rejected": 9.609313011169434, "step": 7325 }, { "epoch": 1.62, "learning_rate": 9.103028963141852e-07, "logits/chosen": -1.6632041931152344, "logits/rejected": -1.6420087814331055, "logps/chosen": -43.571075439453125, "logps/rejected": -56.474761962890625, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 4.570023536682129, "rewards/margins": 2.019284248352051, "rewards/rejected": 2.550739288330078, "step": 7326 }, { "epoch": 1.62, "learning_rate": 9.092720249411685e-07, "logits/chosen": -2.1018362045288086, "logits/rejected": -2.055306911468506, "logps/chosen": -35.46747589111328, "logps/rejected": -32.33782958984375, "loss": 0.3804, "rewards/accuracies": 1.0, "rewards/chosen": 3.6630523204803467, "rewards/margins": 0.1260061264038086, "rewards/rejected": 3.537046194076538, "step": 7327 }, { "epoch": 1.62, "learning_rate": 9.082416792168608e-07, "logits/chosen": -1.9290215969085693, "logits/rejected": -1.8560864925384521, "logps/chosen": -41.29322814941406, "logps/rejected": -35.27677917480469, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 4.1157121658325195, "rewards/margins": 3.1251254081726074, "rewards/rejected": 0.9905868768692017, "step": 7328 }, { "epoch": 1.62, "learning_rate": 9.072118592736628e-07, "logits/chosen": -1.9923474788665771, "logits/rejected": -2.0189743041992188, "logps/chosen": -36.01042556762695, "logps/rejected": -39.237060546875, "loss": 0.719, "rewards/accuracies": 0.0, "rewards/chosen": 3.687530279159546, "rewards/margins": -1.0573551654815674, "rewards/rejected": 4.744885444641113, "step": 7329 }, { "epoch": 1.62, "learning_rate": 9.061825652438999e-07, "logits/chosen": -2.0458648204803467, "logits/rejected": -1.9114726781845093, "logps/chosen": -106.9527587890625, "logps/rejected": -71.50564575195312, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 8.691081047058105, "rewards/margins": 6.153282165527344, "rewards/rejected": 2.537799119949341, "step": 7330 }, { "epoch": 1.62, "learning_rate": 9.051537972598389e-07, "logits/chosen": -1.7592737674713135, "logits/rejected": -1.7598321437835693, "logps/chosen": -41.2031364440918, "logps/rejected": -52.86724090576172, "loss": 0.8011, "rewards/accuracies": 0.0, "rewards/chosen": 3.6073596477508545, "rewards/margins": -1.3527371883392334, "rewards/rejected": 4.960096836090088, "step": 7331 }, { "epoch": 1.62, "learning_rate": 9.041255554536693e-07, "logits/chosen": -1.9323194026947021, "logits/rejected": -1.904598593711853, "logps/chosen": -37.824928283691406, "logps/rejected": -48.46028137207031, "loss": 0.3553, "rewards/accuracies": 1.0, "rewards/chosen": 3.746187686920166, "rewards/margins": 0.13378310203552246, "rewards/rejected": 3.6124045848846436, "step": 7332 }, { "epoch": 1.62, "learning_rate": 9.030978399575202e-07, "logits/chosen": -2.2826075553894043, "logits/rejected": -2.269731283187866, "logps/chosen": -48.61109924316406, "logps/rejected": -77.66295623779297, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": 6.044486999511719, "rewards/margins": 1.1208405494689941, "rewards/rejected": 4.923646450042725, "step": 7333 }, { "epoch": 1.62, "learning_rate": 9.0207065090345e-07, "logits/chosen": -1.9116989374160767, "logits/rejected": -1.783523678779602, "logps/chosen": -103.0863037109375, "logps/rejected": -67.05027770996094, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 6.646928310394287, "rewards/margins": 4.699227809906006, "rewards/rejected": 1.9477005004882812, "step": 7334 }, { "epoch": 1.62, "learning_rate": 9.010439884234495e-07, "logits/chosen": -2.1496002674102783, "logits/rejected": -2.09535813331604, "logps/chosen": -88.1048355102539, "logps/rejected": -57.38771438598633, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 7.635865211486816, "rewards/margins": 3.047008991241455, "rewards/rejected": 4.588856220245361, "step": 7335 }, { "epoch": 1.62, "learning_rate": 9.000178526494441e-07, "logits/chosen": -1.9731075763702393, "logits/rejected": -1.9731075763702393, "logps/chosen": -20.254812240600586, "logps/rejected": -20.254812240600586, "loss": 0.3479, "rewards/accuracies": 0.0, "rewards/chosen": 2.656137228012085, "rewards/margins": 0.0, "rewards/rejected": 2.656137228012085, "step": 7336 }, { "epoch": 1.62, "learning_rate": 8.989922437132865e-07, "logits/chosen": -1.832748532295227, "logits/rejected": -1.8414044380187988, "logps/chosen": -53.682044982910156, "logps/rejected": -43.1121940612793, "loss": 0.9888, "rewards/accuracies": 0.0, "rewards/chosen": 2.462446689605713, "rewards/margins": -1.561774730682373, "rewards/rejected": 4.024221420288086, "step": 7337 }, { "epoch": 1.62, "learning_rate": 8.979671617467695e-07, "logits/chosen": -1.8313217163085938, "logits/rejected": -1.7908353805541992, "logps/chosen": -75.04478454589844, "logps/rejected": -91.916015625, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": 5.460119724273682, "rewards/margins": 1.6235625743865967, "rewards/rejected": 3.836557149887085, "step": 7338 }, { "epoch": 1.62, "learning_rate": 8.969426068816084e-07, "logits/chosen": -1.9964179992675781, "logits/rejected": -1.968997597694397, "logps/chosen": -101.20166015625, "logps/rejected": -143.68702697753906, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 9.360885620117188, "rewards/margins": 2.6132383346557617, "rewards/rejected": 6.747647285461426, "step": 7339 }, { "epoch": 1.62, "learning_rate": 8.959185792494618e-07, "logits/chosen": -2.1172871589660645, "logits/rejected": -2.128892183303833, "logps/chosen": -38.196407318115234, "logps/rejected": -71.35237884521484, "loss": 0.2652, "rewards/accuracies": 1.0, "rewards/chosen": 4.041383743286133, "rewards/margins": 0.6781656742095947, "rewards/rejected": 3.363218069076538, "step": 7340 }, { "epoch": 1.62, "learning_rate": 8.9489507898191e-07, "logits/chosen": -1.9689005613327026, "logits/rejected": -1.911849021911621, "logps/chosen": -30.162460327148438, "logps/rejected": -189.98095703125, "loss": 1.7992, "rewards/accuracies": 0.0, "rewards/chosen": 5.25659704208374, "rewards/margins": -3.5382943153381348, "rewards/rejected": 8.794891357421875, "step": 7341 }, { "epoch": 1.63, "learning_rate": 8.938721062104727e-07, "logits/chosen": -1.6436272859573364, "logits/rejected": -1.5111825466156006, "logps/chosen": -85.22842407226562, "logps/rejected": -58.070247650146484, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 6.4440765380859375, "rewards/margins": 3.317728042602539, "rewards/rejected": 3.1263484954833984, "step": 7342 }, { "epoch": 1.63, "learning_rate": 8.928496610665982e-07, "logits/chosen": -2.2171175479888916, "logits/rejected": -2.223841667175293, "logps/chosen": -97.47128295898438, "logps/rejected": -140.75308227539062, "loss": 0.2119, "rewards/accuracies": 1.0, "rewards/chosen": 9.302477836608887, "rewards/margins": 0.644256591796875, "rewards/rejected": 8.658221244812012, "step": 7343 }, { "epoch": 1.63, "learning_rate": 8.918277436816697e-07, "logits/chosen": -1.9404107332229614, "logits/rejected": -1.9404107332229614, "logps/chosen": -36.75904083251953, "logps/rejected": -36.75904083251953, "loss": 0.3912, "rewards/accuracies": 0.0, "rewards/chosen": 2.792531728744507, "rewards/margins": 0.0, "rewards/rejected": 2.792531728744507, "step": 7344 }, { "epoch": 1.63, "learning_rate": 8.90806354187001e-07, "logits/chosen": -1.9132945537567139, "logits/rejected": -1.8633460998535156, "logps/chosen": -37.35902786254883, "logps/rejected": -15.095268249511719, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": 3.0977208614349365, "rewards/margins": 0.6343998908996582, "rewards/rejected": 2.4633209705352783, "step": 7345 }, { "epoch": 1.63, "learning_rate": 8.897854927138356e-07, "logits/chosen": -2.032238721847534, "logits/rejected": -2.0631113052368164, "logps/chosen": -60.99673843383789, "logps/rejected": -81.10699462890625, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": 8.438450813293457, "rewards/margins": 1.616563320159912, "rewards/rejected": 6.821887493133545, "step": 7346 }, { "epoch": 1.63, "learning_rate": 8.887651593933561e-07, "logits/chosen": -1.8514657020568848, "logits/rejected": -1.8729407787322998, "logps/chosen": -63.955177307128906, "logps/rejected": -170.75717163085938, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 6.119365215301514, "rewards/margins": 4.096068382263184, "rewards/rejected": 2.023297071456909, "step": 7347 }, { "epoch": 1.63, "learning_rate": 8.877453543566678e-07, "logits/chosen": -1.6436681747436523, "logits/rejected": -1.6436681747436523, "logps/chosen": -45.868621826171875, "logps/rejected": -45.868621826171875, "loss": 0.4476, "rewards/accuracies": 0.0, "rewards/chosen": 3.4441163539886475, "rewards/margins": 0.0, "rewards/rejected": 3.4441163539886475, "step": 7348 }, { "epoch": 1.63, "learning_rate": 8.86726077734818e-07, "logits/chosen": -1.9990845918655396, "logits/rejected": -1.836158275604248, "logps/chosen": -84.36239624023438, "logps/rejected": -35.40039825439453, "loss": 0.1415, "rewards/accuracies": 1.0, "rewards/chosen": 7.285241603851318, "rewards/margins": 5.626507759094238, "rewards/rejected": 1.6587337255477905, "step": 7349 }, { "epoch": 1.63, "learning_rate": 8.857073296587781e-07, "logits/chosen": -2.1241118907928467, "logits/rejected": -2.131883382797241, "logps/chosen": -57.571502685546875, "logps/rejected": -91.08785247802734, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 6.747109889984131, "rewards/margins": 2.950951337814331, "rewards/rejected": 3.7961585521698, "step": 7350 }, { "epoch": 1.63, "learning_rate": 8.846891102594562e-07, "logits/chosen": -1.962207317352295, "logits/rejected": -2.0509798526763916, "logps/chosen": -25.766555786132812, "logps/rejected": -99.43133544921875, "loss": 2.3744, "rewards/accuracies": 0.0, "rewards/chosen": 5.136985778808594, "rewards/margins": -2.2657103538513184, "rewards/rejected": 7.402696132659912, "step": 7351 }, { "epoch": 1.63, "learning_rate": 8.83671419667691e-07, "logits/chosen": -1.9582477807998657, "logits/rejected": -1.9090402126312256, "logps/chosen": -35.964942932128906, "logps/rejected": -18.19097900390625, "loss": 0.17, "rewards/accuracies": 1.0, "rewards/chosen": 3.5721123218536377, "rewards/margins": 1.971875786781311, "rewards/rejected": 1.6002365350723267, "step": 7352 }, { "epoch": 1.63, "learning_rate": 8.826542580142522e-07, "logits/chosen": -2.0823135375976562, "logits/rejected": -2.0519635677337646, "logps/chosen": -34.670013427734375, "logps/rejected": -49.51188659667969, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 4.4985527992248535, "rewards/margins": 1.7374718189239502, "rewards/rejected": 2.7610809803009033, "step": 7353 }, { "epoch": 1.63, "learning_rate": 8.816376254298437e-07, "logits/chosen": -1.7328217029571533, "logits/rejected": -1.8068197965621948, "logps/chosen": -21.189228057861328, "logps/rejected": -103.68898010253906, "loss": 0.9505, "rewards/accuracies": 0.0, "rewards/chosen": 4.3251423835754395, "rewards/margins": -1.7083158493041992, "rewards/rejected": 6.033458232879639, "step": 7354 }, { "epoch": 1.63, "learning_rate": 8.806215220451003e-07, "logits/chosen": -2.07800030708313, "logits/rejected": -2.1522481441497803, "logps/chosen": -30.1300048828125, "logps/rejected": -124.68321990966797, "loss": 1.2403, "rewards/accuracies": 0.0, "rewards/chosen": 4.825512886047363, "rewards/margins": -2.3386788368225098, "rewards/rejected": 7.164191722869873, "step": 7355 }, { "epoch": 1.63, "learning_rate": 8.7960594799059e-07, "logits/chosen": -2.28151535987854, "logits/rejected": -2.2727248668670654, "logps/chosen": -43.996299743652344, "logps/rejected": -68.8328857421875, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": 5.168739318847656, "rewards/margins": 1.522768259048462, "rewards/rejected": 3.6459710597991943, "step": 7356 }, { "epoch": 1.63, "learning_rate": 8.785909033968082e-07, "logits/chosen": -2.1280112266540527, "logits/rejected": -2.113651752471924, "logps/chosen": -83.07467651367188, "logps/rejected": -179.18777465820312, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 10.964335441589355, "rewards/margins": 3.479255199432373, "rewards/rejected": 7.485080242156982, "step": 7357 }, { "epoch": 1.63, "learning_rate": 8.775763883941885e-07, "logits/chosen": -1.9654401540756226, "logits/rejected": -1.9654401540756226, "logps/chosen": -38.65618133544922, "logps/rejected": -38.65618133544922, "loss": 0.3796, "rewards/accuracies": 0.0, "rewards/chosen": 4.840122222900391, "rewards/margins": 0.0, "rewards/rejected": 4.840122222900391, "step": 7358 }, { "epoch": 1.63, "learning_rate": 8.765624031130925e-07, "logits/chosen": -1.8031612634658813, "logits/rejected": -1.7602379322052002, "logps/chosen": -76.8639144897461, "logps/rejected": -63.651859283447266, "loss": 0.3456, "rewards/accuracies": 1.0, "rewards/chosen": 5.347970008850098, "rewards/margins": 3.2087972164154053, "rewards/rejected": 2.1391727924346924, "step": 7359 }, { "epoch": 1.63, "learning_rate": 8.755489476838152e-07, "logits/chosen": -1.6841809749603271, "logits/rejected": -1.6501930952072144, "logps/chosen": -33.601261138916016, "logps/rejected": -61.5455322265625, "loss": 0.3369, "rewards/accuracies": 1.0, "rewards/chosen": 4.61408805847168, "rewards/margins": 1.0651607513427734, "rewards/rejected": 3.5489273071289062, "step": 7360 }, { "epoch": 1.63, "learning_rate": 8.745360222365834e-07, "logits/chosen": -1.4065113067626953, "logits/rejected": -1.4065113067626953, "logps/chosen": -19.215251922607422, "logps/rejected": -19.215251922607422, "loss": 0.7454, "rewards/accuracies": 0.0, "rewards/chosen": 2.0811314582824707, "rewards/margins": 0.0, "rewards/rejected": 2.0811314582824707, "step": 7361 }, { "epoch": 1.63, "learning_rate": 8.735236269015551e-07, "logits/chosen": -2.10482120513916, "logits/rejected": -2.111173629760742, "logps/chosen": -36.18190383911133, "logps/rejected": -74.640869140625, "loss": 0.4774, "rewards/accuracies": 1.0, "rewards/chosen": 2.951021194458008, "rewards/margins": 0.9633487462997437, "rewards/rejected": 1.9876724481582642, "step": 7362 }, { "epoch": 1.63, "learning_rate": 8.725117618088214e-07, "logits/chosen": -1.8970658779144287, "logits/rejected": -1.8359142541885376, "logps/chosen": -118.90084838867188, "logps/rejected": -77.79971313476562, "loss": 0.7269, "rewards/accuracies": 0.0, "rewards/chosen": 7.653180122375488, "rewards/margins": -1.1812314987182617, "rewards/rejected": 8.83441162109375, "step": 7363 }, { "epoch": 1.63, "learning_rate": 8.715004270884037e-07, "logits/chosen": -1.8020249605178833, "logits/rejected": -1.7935097217559814, "logps/chosen": -123.68507385253906, "logps/rejected": -159.05743408203125, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 10.189282417297363, "rewards/margins": 3.7549777030944824, "rewards/rejected": 6.434304714202881, "step": 7364 }, { "epoch": 1.63, "learning_rate": 8.704896228702581e-07, "logits/chosen": -1.9284822940826416, "logits/rejected": -1.8407357931137085, "logps/chosen": -36.881813049316406, "logps/rejected": -22.025108337402344, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 4.286983013153076, "rewards/margins": 3.1268696784973145, "rewards/rejected": 1.1601132154464722, "step": 7365 }, { "epoch": 1.63, "learning_rate": 8.694793492842674e-07, "logits/chosen": -2.176185369491577, "logits/rejected": -2.054150104522705, "logps/chosen": -116.84832000732422, "logps/rejected": -21.656152725219727, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 6.751130104064941, "rewards/margins": 5.9200215339660645, "rewards/rejected": 0.8311085104942322, "step": 7366 }, { "epoch": 1.63, "learning_rate": 8.68469606460251e-07, "logits/chosen": -1.70145845413208, "logits/rejected": -1.7165117263793945, "logps/chosen": -43.59040069580078, "logps/rejected": -53.90803146362305, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 4.352248668670654, "rewards/margins": 1.462822675704956, "rewards/rejected": 2.8894259929656982, "step": 7367 }, { "epoch": 1.63, "learning_rate": 8.67460394527958e-07, "logits/chosen": -2.0019397735595703, "logits/rejected": -1.9891563653945923, "logps/chosen": -44.26114273071289, "logps/rejected": -47.14921188354492, "loss": 0.9792, "rewards/accuracies": 1.0, "rewards/chosen": 2.8426570892333984, "rewards/margins": 0.000213623046875, "rewards/rejected": 2.8424434661865234, "step": 7368 }, { "epoch": 1.63, "learning_rate": 8.664517136170702e-07, "logits/chosen": -1.9351308345794678, "logits/rejected": -1.842686653137207, "logps/chosen": -57.0105094909668, "logps/rejected": -16.559059143066406, "loss": 0.1319, "rewards/accuracies": 1.0, "rewards/chosen": 3.4277286529541016, "rewards/margins": 2.0369529724121094, "rewards/rejected": 1.3907756805419922, "step": 7369 }, { "epoch": 1.63, "learning_rate": 8.654435638572e-07, "logits/chosen": -2.0732946395874023, "logits/rejected": -2.0539934635162354, "logps/chosen": -58.8258171081543, "logps/rejected": -83.78610229492188, "loss": 0.3998, "rewards/accuracies": 1.0, "rewards/chosen": 5.183983325958252, "rewards/margins": 0.5293779373168945, "rewards/rejected": 4.654605388641357, "step": 7370 }, { "epoch": 1.63, "learning_rate": 8.644359453778934e-07, "logits/chosen": -1.7399884462356567, "logits/rejected": -1.7919096946716309, "logps/chosen": -69.25935363769531, "logps/rejected": -83.64096069335938, "loss": 1.0673, "rewards/accuracies": 0.0, "rewards/chosen": 5.9255571365356445, "rewards/margins": -1.951979160308838, "rewards/rejected": 7.877536296844482, "step": 7371 }, { "epoch": 1.63, "learning_rate": 8.634288583086253e-07, "logits/chosen": -1.7828445434570312, "logits/rejected": -1.7682204246520996, "logps/chosen": -40.97785568237305, "logps/rejected": -79.87979125976562, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": 2.676874876022339, "rewards/margins": 2.336516857147217, "rewards/rejected": 0.3403579890727997, "step": 7372 }, { "epoch": 1.63, "learning_rate": 8.624223027788053e-07, "logits/chosen": -1.978532314300537, "logits/rejected": -1.9645655155181885, "logps/chosen": -76.25096130371094, "logps/rejected": -134.27796936035156, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 9.456148147583008, "rewards/margins": 3.4608888626098633, "rewards/rejected": 5.9952592849731445, "step": 7373 }, { "epoch": 1.63, "learning_rate": 8.614162789177732e-07, "logits/chosen": -1.7753489017486572, "logits/rejected": -1.6461044549942017, "logps/chosen": -53.74000549316406, "logps/rejected": -7.735487937927246, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": 3.512132406234741, "rewards/margins": 3.066200017929077, "rewards/rejected": 0.4459324777126312, "step": 7374 }, { "epoch": 1.63, "learning_rate": 8.604107868548012e-07, "logits/chosen": -1.9150737524032593, "logits/rejected": -1.7891759872436523, "logps/chosen": -68.17904663085938, "logps/rejected": -31.71810531616211, "loss": 0.2413, "rewards/accuracies": 1.0, "rewards/chosen": 0.9684066772460938, "rewards/margins": 0.49031561613082886, "rewards/rejected": 0.4780910611152649, "step": 7375 }, { "epoch": 1.63, "learning_rate": 8.594058267190908e-07, "logits/chosen": -1.8775634765625, "logits/rejected": -1.7549508810043335, "logps/chosen": -39.64105987548828, "logps/rejected": -34.845272064208984, "loss": 1.3211, "rewards/accuracies": 0.0, "rewards/chosen": 3.6076393127441406, "rewards/margins": -0.07616019248962402, "rewards/rejected": 3.6837995052337646, "step": 7376 }, { "epoch": 1.63, "learning_rate": 8.584013986397776e-07, "logits/chosen": -1.5854790210723877, "logits/rejected": -1.5854790210723877, "logps/chosen": -15.669709205627441, "logps/rejected": -15.669709205627441, "loss": 0.7189, "rewards/accuracies": 0.0, "rewards/chosen": 2.05077862739563, "rewards/margins": 0.0, "rewards/rejected": 2.05077862739563, "step": 7377 }, { "epoch": 1.63, "learning_rate": 8.573975027459292e-07, "logits/chosen": -1.8560349941253662, "logits/rejected": -1.7923624515533447, "logps/chosen": -87.4884262084961, "logps/rejected": -82.32257080078125, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 7.744545936584473, "rewards/margins": 4.609701156616211, "rewards/rejected": 3.134845018386841, "step": 7378 }, { "epoch": 1.63, "learning_rate": 8.563941391665425e-07, "logits/chosen": -2.4355483055114746, "logits/rejected": -1.4783680438995361, "logps/chosen": -82.1300277709961, "logps/rejected": -113.74957275390625, "loss": 0.3225, "rewards/accuracies": 1.0, "rewards/chosen": 3.953472852706909, "rewards/margins": 0.7235243320465088, "rewards/rejected": 3.2299485206604004, "step": 7379 }, { "epoch": 1.63, "learning_rate": 8.553913080305482e-07, "logits/chosen": -1.9870095252990723, "logits/rejected": -1.8490369319915771, "logps/chosen": -124.44732666015625, "logps/rejected": -34.55328369140625, "loss": 0.0845, "rewards/accuracies": 1.0, "rewards/chosen": 4.85701322555542, "rewards/margins": 1.8754069805145264, "rewards/rejected": 2.9816062450408936, "step": 7380 }, { "epoch": 1.63, "learning_rate": 8.543890094668078e-07, "logits/chosen": -1.839355230331421, "logits/rejected": -1.765397071838379, "logps/chosen": -16.44240951538086, "logps/rejected": -14.066753387451172, "loss": 0.2673, "rewards/accuracies": 1.0, "rewards/chosen": 1.5345503091812134, "rewards/margins": 0.9752597808837891, "rewards/rejected": 0.5592905282974243, "step": 7381 }, { "epoch": 1.63, "learning_rate": 8.533872436041135e-07, "logits/chosen": -1.7808934450149536, "logits/rejected": -1.8415910005569458, "logps/chosen": -17.16797637939453, "logps/rejected": -106.61517333984375, "loss": 0.6311, "rewards/accuracies": 0.0, "rewards/chosen": 3.4013593196868896, "rewards/margins": -0.3963167667388916, "rewards/rejected": 3.7976760864257812, "step": 7382 }, { "epoch": 1.63, "learning_rate": 8.523860105711901e-07, "logits/chosen": -1.985994815826416, "logits/rejected": -1.9528812170028687, "logps/chosen": -33.47906494140625, "logps/rejected": -56.47979736328125, "loss": 0.2755, "rewards/accuracies": 1.0, "rewards/chosen": 3.1468758583068848, "rewards/margins": 0.46980977058410645, "rewards/rejected": 2.6770660877227783, "step": 7383 }, { "epoch": 1.63, "learning_rate": 8.513853104966951e-07, "logits/chosen": -1.6581982374191284, "logits/rejected": -1.4016066789627075, "logps/chosen": -53.11719512939453, "logps/rejected": -95.4097900390625, "loss": 0.8247, "rewards/accuracies": 0.0, "rewards/chosen": 3.2733230590820312, "rewards/margins": -1.416825771331787, "rewards/rejected": 4.690148830413818, "step": 7384 }, { "epoch": 1.63, "learning_rate": 8.503851435092125e-07, "logits/chosen": -1.8676676750183105, "logits/rejected": -1.8959029912948608, "logps/chosen": -37.556434631347656, "logps/rejected": -69.45254516601562, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": 4.124713897705078, "rewards/margins": 0.6979293823242188, "rewards/rejected": 3.4267845153808594, "step": 7385 }, { "epoch": 1.63, "learning_rate": 8.493855097372661e-07, "logits/chosen": -1.8585916757583618, "logits/rejected": -1.7824902534484863, "logps/chosen": -41.800323486328125, "logps/rejected": -48.88301467895508, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 4.414760589599609, "rewards/margins": 3.4340980052948, "rewards/rejected": 0.9806625247001648, "step": 7386 }, { "epoch": 1.64, "learning_rate": 8.48386409309302e-07, "logits/chosen": -1.8645573854446411, "logits/rejected": -1.9225517511367798, "logps/chosen": -106.70899963378906, "logps/rejected": -107.90892028808594, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 9.488249778747559, "rewards/margins": 4.344267845153809, "rewards/rejected": 5.14398193359375, "step": 7387 }, { "epoch": 1.64, "learning_rate": 8.473878423537046e-07, "logits/chosen": -2.0533618927001953, "logits/rejected": -1.9757078886032104, "logps/chosen": -99.63544464111328, "logps/rejected": -66.56166076660156, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 7.402109622955322, "rewards/margins": 3.5780441761016846, "rewards/rejected": 3.8240654468536377, "step": 7388 }, { "epoch": 1.64, "learning_rate": 8.463898089987871e-07, "logits/chosen": -1.9441654682159424, "logits/rejected": -1.8745882511138916, "logps/chosen": -138.1956024169922, "logps/rejected": -87.99659729003906, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 7.557260036468506, "rewards/margins": 3.4793272018432617, "rewards/rejected": 4.077932834625244, "step": 7389 }, { "epoch": 1.64, "learning_rate": 8.453923093727933e-07, "logits/chosen": -1.8693418502807617, "logits/rejected": -1.8679569959640503, "logps/chosen": -62.45414733886719, "logps/rejected": -135.97647094726562, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 7.391231060028076, "rewards/margins": 2.6499528884887695, "rewards/rejected": 4.741278171539307, "step": 7390 }, { "epoch": 1.64, "learning_rate": 8.44395343603901e-07, "logits/chosen": -2.115035057067871, "logits/rejected": -2.1015520095825195, "logps/chosen": -71.80487060546875, "logps/rejected": -97.87785339355469, "loss": 1.0946, "rewards/accuracies": 0.0, "rewards/chosen": 8.667838096618652, "rewards/margins": -2.0662364959716797, "rewards/rejected": 10.734074592590332, "step": 7391 }, { "epoch": 1.64, "learning_rate": 8.433989118202174e-07, "logits/chosen": -1.7865580320358276, "logits/rejected": -1.8192209005355835, "logps/chosen": -35.812129974365234, "logps/rejected": -112.55607604980469, "loss": 1.2401, "rewards/accuracies": 0.0, "rewards/chosen": 5.2731146812438965, "rewards/margins": -2.386223793029785, "rewards/rejected": 7.659338474273682, "step": 7392 }, { "epoch": 1.64, "learning_rate": 8.424030141497824e-07, "logits/chosen": -2.03619384765625, "logits/rejected": -1.8109099864959717, "logps/chosen": -116.63131713867188, "logps/rejected": -33.274837493896484, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 5.275964260101318, "rewards/margins": 4.8647332191467285, "rewards/rejected": 0.41123124957084656, "step": 7393 }, { "epoch": 1.64, "learning_rate": 8.414076507205626e-07, "logits/chosen": -2.0228097438812256, "logits/rejected": -2.0355942249298096, "logps/chosen": -41.634033203125, "logps/rejected": -95.17431640625, "loss": 0.0744, "rewards/accuracies": 1.0, "rewards/chosen": 2.1312851905822754, "rewards/margins": 1.9716897010803223, "rewards/rejected": 0.15959548950195312, "step": 7394 }, { "epoch": 1.64, "learning_rate": 8.404128216604656e-07, "logits/chosen": -1.659010410308838, "logits/rejected": -1.532213568687439, "logps/chosen": -126.38233947753906, "logps/rejected": -93.58786010742188, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 6.1602067947387695, "rewards/margins": 3.8261492252349854, "rewards/rejected": 2.334057569503784, "step": 7395 }, { "epoch": 1.64, "learning_rate": 8.394185270973204e-07, "logits/chosen": -1.5619866847991943, "logits/rejected": -1.6294766664505005, "logps/chosen": -21.109949111938477, "logps/rejected": -45.087890625, "loss": 0.8785, "rewards/accuracies": 0.0, "rewards/chosen": 2.4883840084075928, "rewards/margins": -1.514465570449829, "rewards/rejected": 4.002849578857422, "step": 7396 }, { "epoch": 1.64, "learning_rate": 8.384247671588929e-07, "logits/chosen": -1.9111560583114624, "logits/rejected": -1.84663724899292, "logps/chosen": -51.107845306396484, "logps/rejected": -20.244873046875, "loss": 0.3494, "rewards/accuracies": 1.0, "rewards/chosen": 2.2238948345184326, "rewards/margins": 0.3107055425643921, "rewards/rejected": 1.9131892919540405, "step": 7397 }, { "epoch": 1.64, "learning_rate": 8.374315419728784e-07, "logits/chosen": -1.7952690124511719, "logits/rejected": -1.7829755544662476, "logps/chosen": -31.140073776245117, "logps/rejected": -41.785316467285156, "loss": 0.5733, "rewards/accuracies": 0.0, "rewards/chosen": 4.427234172821045, "rewards/margins": -0.6043338775634766, "rewards/rejected": 5.0315680503845215, "step": 7398 }, { "epoch": 1.64, "learning_rate": 8.364388516669041e-07, "logits/chosen": -1.7598564624786377, "logits/rejected": -1.7976192235946655, "logps/chosen": -35.040748596191406, "logps/rejected": -38.062740325927734, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 4.620538234710693, "rewards/margins": 0.2339935302734375, "rewards/rejected": 4.386544704437256, "step": 7399 }, { "epoch": 1.64, "learning_rate": 8.354466963685287e-07, "logits/chosen": -2.2064311504364014, "logits/rejected": -1.7571483850479126, "logps/chosen": -93.0718994140625, "logps/rejected": -48.57899475097656, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 9.108197212219238, "rewards/margins": 6.441061019897461, "rewards/rejected": 2.6671364307403564, "step": 7400 }, { "epoch": 1.64, "learning_rate": 8.344550762052417e-07, "logits/chosen": -1.963392734527588, "logits/rejected": -1.9307332038879395, "logps/chosen": -42.2197380065918, "logps/rejected": -39.17571258544922, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 3.7511098384857178, "rewards/margins": 2.782447576522827, "rewards/rejected": 0.9686622619628906, "step": 7401 }, { "epoch": 1.64, "learning_rate": 8.334639913044646e-07, "logits/chosen": -1.9651247262954712, "logits/rejected": -1.874595284461975, "logps/chosen": -49.861183166503906, "logps/rejected": -128.55990600585938, "loss": 1.3352, "rewards/accuracies": 0.0, "rewards/chosen": 5.881655216217041, "rewards/margins": -2.591370105743408, "rewards/rejected": 8.47302532196045, "step": 7402 }, { "epoch": 1.64, "learning_rate": 8.32473441793546e-07, "logits/chosen": -1.889176607131958, "logits/rejected": -1.884735107421875, "logps/chosen": -40.177490234375, "logps/rejected": -73.93317413330078, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": 3.242213487625122, "rewards/margins": 1.7348060607910156, "rewards/rejected": 1.5074074268341064, "step": 7403 }, { "epoch": 1.64, "learning_rate": 8.314834277997747e-07, "logits/chosen": -2.0065579414367676, "logits/rejected": -2.0131173133850098, "logps/chosen": -59.05073928833008, "logps/rejected": -148.27235412597656, "loss": 0.4947, "rewards/accuracies": 0.0, "rewards/chosen": 9.129020690917969, "rewards/margins": -0.51513671875, "rewards/rejected": 9.644157409667969, "step": 7404 }, { "epoch": 1.64, "learning_rate": 8.304939494503588e-07, "logits/chosen": -2.155395269393921, "logits/rejected": -2.0922160148620605, "logps/chosen": -148.89674377441406, "logps/rejected": -76.92268371582031, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 8.554420471191406, "rewards/margins": 6.058862209320068, "rewards/rejected": 2.495558261871338, "step": 7405 }, { "epoch": 1.64, "learning_rate": 8.295050068724503e-07, "logits/chosen": -2.104673385620117, "logits/rejected": -2.072942018508911, "logps/chosen": -69.73805236816406, "logps/rejected": -60.105682373046875, "loss": 0.3904, "rewards/accuracies": 0.0, "rewards/chosen": 4.453782558441162, "rewards/margins": -0.1435227394104004, "rewards/rejected": 4.5973052978515625, "step": 7406 }, { "epoch": 1.64, "learning_rate": 8.28516600193121e-07, "logits/chosen": -2.0763301849365234, "logits/rejected": -1.9847908020019531, "logps/chosen": -134.63552856445312, "logps/rejected": -64.99346923828125, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 6.5489349365234375, "rewards/margins": 3.991877794265747, "rewards/rejected": 2.5570571422576904, "step": 7407 }, { "epoch": 1.64, "learning_rate": 8.275287295393813e-07, "logits/chosen": -1.7012665271759033, "logits/rejected": -1.6760013103485107, "logps/chosen": -50.441680908203125, "logps/rejected": -56.018150329589844, "loss": 0.1389, "rewards/accuracies": 1.0, "rewards/chosen": 3.277658224105835, "rewards/margins": 1.315712809562683, "rewards/rejected": 1.9619454145431519, "step": 7408 }, { "epoch": 1.64, "learning_rate": 8.265413950381707e-07, "logits/chosen": -1.930232048034668, "logits/rejected": -1.9347763061523438, "logps/chosen": -44.28916931152344, "logps/rejected": -125.1141128540039, "loss": 1.8278, "rewards/accuracies": 0.0, "rewards/chosen": 5.511788845062256, "rewards/margins": -2.5666441917419434, "rewards/rejected": 8.0784330368042, "step": 7409 }, { "epoch": 1.64, "learning_rate": 8.255545968163553e-07, "logits/chosen": -1.8575078248977661, "logits/rejected": -1.7875940799713135, "logps/chosen": -111.22027587890625, "logps/rejected": -117.16049194335938, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 10.258478164672852, "rewards/margins": 2.6838135719299316, "rewards/rejected": 7.57466459274292, "step": 7410 }, { "epoch": 1.64, "learning_rate": 8.245683350007416e-07, "logits/chosen": -1.8455243110656738, "logits/rejected": -1.8292125463485718, "logps/chosen": -33.4886589050293, "logps/rejected": -34.655296325683594, "loss": 0.8702, "rewards/accuracies": 0.0, "rewards/chosen": 2.6719729900360107, "rewards/margins": -1.293074607849121, "rewards/rejected": 3.965047597885132, "step": 7411 }, { "epoch": 1.64, "learning_rate": 8.235826097180566e-07, "logits/chosen": -1.7516525983810425, "logits/rejected": -1.7381221055984497, "logps/chosen": -59.571842193603516, "logps/rejected": -48.79361343383789, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 4.624584674835205, "rewards/margins": 0.11877822875976562, "rewards/rejected": 4.5058064460754395, "step": 7412 }, { "epoch": 1.64, "learning_rate": 8.225974210949695e-07, "logits/chosen": -1.9240806102752686, "logits/rejected": -1.907874345779419, "logps/chosen": -66.45208740234375, "logps/rejected": -56.1379280090332, "loss": 0.2044, "rewards/accuracies": 1.0, "rewards/chosen": 6.019847869873047, "rewards/margins": 0.686159610748291, "rewards/rejected": 5.333688259124756, "step": 7413 }, { "epoch": 1.64, "learning_rate": 8.216127692580689e-07, "logits/chosen": -1.949859857559204, "logits/rejected": -1.8599852323532104, "logps/chosen": -121.86263275146484, "logps/rejected": -104.22686767578125, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 7.1696600914001465, "rewards/margins": 3.172264814376831, "rewards/rejected": 3.9973952770233154, "step": 7414 }, { "epoch": 1.64, "learning_rate": 8.206286543338854e-07, "logits/chosen": -1.802857518196106, "logits/rejected": -1.8091779947280884, "logps/chosen": -40.52723693847656, "logps/rejected": -35.15509033203125, "loss": 0.6028, "rewards/accuracies": 0.0, "rewards/chosen": 3.3140251636505127, "rewards/margins": -0.2967352867126465, "rewards/rejected": 3.610760450363159, "step": 7415 }, { "epoch": 1.64, "learning_rate": 8.196450764488717e-07, "logits/chosen": -1.9126015901565552, "logits/rejected": -1.8783973455429077, "logps/chosen": -59.38728332519531, "logps/rejected": -72.30116271972656, "loss": 0.2636, "rewards/accuracies": 1.0, "rewards/chosen": 4.007625102996826, "rewards/margins": 0.5013377666473389, "rewards/rejected": 3.5062873363494873, "step": 7416 }, { "epoch": 1.64, "learning_rate": 8.186620357294162e-07, "logits/chosen": -1.941758155822754, "logits/rejected": -1.942636489868164, "logps/chosen": -29.931306838989258, "logps/rejected": -74.16246795654297, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 3.741497039794922, "rewards/margins": -1.051286220550537, "rewards/rejected": 4.792783260345459, "step": 7417 }, { "epoch": 1.64, "learning_rate": 8.176795323018388e-07, "logits/chosen": -1.7764101028442383, "logits/rejected": -1.7961176633834839, "logps/chosen": -50.99732971191406, "logps/rejected": -30.250818252563477, "loss": 0.467, "rewards/accuracies": 0.0, "rewards/chosen": 3.8280885219573975, "rewards/margins": -0.3575723171234131, "rewards/rejected": 4.1856608390808105, "step": 7418 }, { "epoch": 1.64, "learning_rate": 8.166975662923849e-07, "logits/chosen": -2.109149694442749, "logits/rejected": -2.0742170810699463, "logps/chosen": -104.68637084960938, "logps/rejected": -115.21498107910156, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 8.424086570739746, "rewards/margins": 2.984123706817627, "rewards/rejected": 5.439962863922119, "step": 7419 }, { "epoch": 1.64, "learning_rate": 8.157161378272399e-07, "logits/chosen": -2.0614125728607178, "logits/rejected": -2.064586639404297, "logps/chosen": -45.89974594116211, "logps/rejected": -72.26154327392578, "loss": 0.522, "rewards/accuracies": 0.0, "rewards/chosen": 4.9374518394470215, "rewards/margins": -0.6023406982421875, "rewards/rejected": 5.539792537689209, "step": 7420 }, { "epoch": 1.64, "learning_rate": 8.147352470325098e-07, "logits/chosen": -2.163753032684326, "logits/rejected": -2.075364828109741, "logps/chosen": -89.77824401855469, "logps/rejected": -25.065044403076172, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": 3.887469530105591, "rewards/margins": 3.1715304851531982, "rewards/rejected": 0.7159389853477478, "step": 7421 }, { "epoch": 1.64, "learning_rate": 8.137548940342422e-07, "logits/chosen": -2.031263589859009, "logits/rejected": -2.056196451187134, "logps/chosen": -56.02525329589844, "logps/rejected": -82.98218536376953, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": 6.698108196258545, "rewards/margins": 1.6661601066589355, "rewards/rejected": 5.031948089599609, "step": 7422 }, { "epoch": 1.64, "learning_rate": 8.127750789584055e-07, "logits/chosen": -1.8950731754302979, "logits/rejected": -1.6717326641082764, "logps/chosen": -41.227081298828125, "logps/rejected": -36.07676696777344, "loss": 0.8451, "rewards/accuracies": 0.0, "rewards/chosen": 4.867344856262207, "rewards/margins": -1.4714064598083496, "rewards/rejected": 6.338751316070557, "step": 7423 }, { "epoch": 1.64, "learning_rate": 8.117958019309057e-07, "logits/chosen": -2.0778684616088867, "logits/rejected": -2.101799249649048, "logps/chosen": -66.04132080078125, "logps/rejected": -41.37346649169922, "loss": 0.6511, "rewards/accuracies": 1.0, "rewards/chosen": 4.9553399085998535, "rewards/margins": 1.0941483974456787, "rewards/rejected": 3.861191511154175, "step": 7424 }, { "epoch": 1.64, "learning_rate": 8.108170630775774e-07, "logits/chosen": -2.3166253566741943, "logits/rejected": -2.3300068378448486, "logps/chosen": -44.106788635253906, "logps/rejected": -67.43666076660156, "loss": 0.3663, "rewards/accuracies": 1.0, "rewards/chosen": 3.6916306018829346, "rewards/margins": 0.6112303733825684, "rewards/rejected": 3.080400228500366, "step": 7425 }, { "epoch": 1.64, "learning_rate": 8.098388625241854e-07, "logits/chosen": -2.0545666217803955, "logits/rejected": -1.9962769746780396, "logps/chosen": -41.766090393066406, "logps/rejected": -30.71905517578125, "loss": 0.2612, "rewards/accuracies": 1.0, "rewards/chosen": 4.489965915679932, "rewards/margins": 3.341573715209961, "rewards/rejected": 1.1483920812606812, "step": 7426 }, { "epoch": 1.64, "learning_rate": 8.088612003964285e-07, "logits/chosen": -1.7762713432312012, "logits/rejected": -1.7762713432312012, "logps/chosen": -65.03340148925781, "logps/rejected": -65.03340148925781, "loss": 0.5507, "rewards/accuracies": 0.0, "rewards/chosen": 2.596259355545044, "rewards/margins": 0.0, "rewards/rejected": 2.596259355545044, "step": 7427 }, { "epoch": 1.64, "learning_rate": 8.078840768199292e-07, "logits/chosen": -1.7741838693618774, "logits/rejected": -1.7242887020111084, "logps/chosen": -39.99136734008789, "logps/rejected": -66.85173797607422, "loss": 0.0798, "rewards/accuracies": 1.0, "rewards/chosen": 3.6313228607177734, "rewards/margins": 2.076388120651245, "rewards/rejected": 1.5549347400665283, "step": 7428 }, { "epoch": 1.64, "learning_rate": 8.069074919202513e-07, "logits/chosen": -1.8571367263793945, "logits/rejected": -1.8162215948104858, "logps/chosen": -26.7041015625, "logps/rejected": -16.489177703857422, "loss": 0.7469, "rewards/accuracies": 0.0, "rewards/chosen": 2.629962205886841, "rewards/margins": -0.22903966903686523, "rewards/rejected": 2.859001874923706, "step": 7429 }, { "epoch": 1.64, "learning_rate": 8.059314458228784e-07, "logits/chosen": -2.21938419342041, "logits/rejected": -2.2166616916656494, "logps/chosen": -29.12295913696289, "logps/rejected": -71.04438781738281, "loss": 0.2922, "rewards/accuracies": 1.0, "rewards/chosen": 3.7678768634796143, "rewards/margins": 0.2473461627960205, "rewards/rejected": 3.5205307006835938, "step": 7430 }, { "epoch": 1.64, "learning_rate": 8.049559386532346e-07, "logits/chosen": -2.2732343673706055, "logits/rejected": -2.278756856918335, "logps/chosen": -2.8943533897399902, "logps/rejected": -58.48523712158203, "loss": 0.5354, "rewards/accuracies": 1.0, "rewards/chosen": 1.0258893966674805, "rewards/margins": 0.09423959255218506, "rewards/rejected": 0.9316498041152954, "step": 7431 }, { "epoch": 1.64, "learning_rate": 8.039809705366669e-07, "logits/chosen": -1.9735437631607056, "logits/rejected": -1.8009552955627441, "logps/chosen": -58.048919677734375, "logps/rejected": -37.066829681396484, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 2.859691619873047, "rewards/margins": 1.553803563117981, "rewards/rejected": 1.305888056755066, "step": 7432 }, { "epoch": 1.65, "learning_rate": 8.030065415984572e-07, "logits/chosen": -1.5910474061965942, "logits/rejected": -1.6327226161956787, "logps/chosen": -31.528278350830078, "logps/rejected": -64.3084716796875, "loss": 0.5411, "rewards/accuracies": 0.0, "rewards/chosen": 3.2335705757141113, "rewards/margins": -0.2905566692352295, "rewards/rejected": 3.524127244949341, "step": 7433 }, { "epoch": 1.65, "learning_rate": 8.020326519638183e-07, "logits/chosen": -2.066560745239258, "logits/rejected": -2.066560745239258, "logps/chosen": -56.84668731689453, "logps/rejected": -56.84668731689453, "loss": 0.3496, "rewards/accuracies": 0.0, "rewards/chosen": 4.026883125305176, "rewards/margins": 0.0, "rewards/rejected": 4.026883125305176, "step": 7434 }, { "epoch": 1.65, "learning_rate": 8.010593017578916e-07, "logits/chosen": -2.1510283946990967, "logits/rejected": -1.9398524761199951, "logps/chosen": -93.4120101928711, "logps/rejected": -123.34138488769531, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 16.68505859375, "rewards/margins": 5.349761962890625, "rewards/rejected": 11.335296630859375, "step": 7435 }, { "epoch": 1.65, "learning_rate": 8.000864911057505e-07, "logits/chosen": -1.8338556289672852, "logits/rejected": -1.7812132835388184, "logps/chosen": -14.467549324035645, "logps/rejected": -1.9623658657073975, "loss": 0.177, "rewards/accuracies": 1.0, "rewards/chosen": 1.5995806455612183, "rewards/margins": 0.8606154322624207, "rewards/rejected": 0.7389652132987976, "step": 7436 }, { "epoch": 1.65, "learning_rate": 7.991142201323992e-07, "logits/chosen": -2.1289689540863037, "logits/rejected": -2.1088476181030273, "logps/chosen": -43.830238342285156, "logps/rejected": -72.70235443115234, "loss": 0.206, "rewards/accuracies": 1.0, "rewards/chosen": 2.8548552989959717, "rewards/margins": 2.6429312229156494, "rewards/rejected": 0.2119239866733551, "step": 7437 }, { "epoch": 1.65, "learning_rate": 7.981424889627731e-07, "logits/chosen": -2.321263074874878, "logits/rejected": -2.3275606632232666, "logps/chosen": -49.14898681640625, "logps/rejected": -66.9688720703125, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 4.044381618499756, "rewards/margins": 1.0895705223083496, "rewards/rejected": 2.9548110961914062, "step": 7438 }, { "epoch": 1.65, "learning_rate": 7.971712977217338e-07, "logits/chosen": -1.912681221961975, "logits/rejected": -1.9709583520889282, "logps/chosen": -123.7496109008789, "logps/rejected": -124.7986068725586, "loss": 0.7979, "rewards/accuracies": 0.0, "rewards/chosen": 7.3080315589904785, "rewards/margins": -0.3000197410583496, "rewards/rejected": 7.608051300048828, "step": 7439 }, { "epoch": 1.65, "learning_rate": 7.962006465340821e-07, "logits/chosen": -1.938599944114685, "logits/rejected": -1.8851500749588013, "logps/chosen": -127.02884674072266, "logps/rejected": -25.928640365600586, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 7.006520748138428, "rewards/margins": 1.7857518196105957, "rewards/rejected": 5.220768928527832, "step": 7440 }, { "epoch": 1.65, "learning_rate": 7.952305355245404e-07, "logits/chosen": -2.054234027862549, "logits/rejected": -2.0676138401031494, "logps/chosen": -39.92274475097656, "logps/rejected": -39.594573974609375, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 3.8263168334960938, "rewards/margins": 0.7307066917419434, "rewards/rejected": 3.0956101417541504, "step": 7441 }, { "epoch": 1.65, "learning_rate": 7.942609648177663e-07, "logits/chosen": -2.0218453407287598, "logits/rejected": -1.882301688194275, "logps/chosen": -113.15361785888672, "logps/rejected": -49.70729064941406, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 5.875800609588623, "rewards/margins": 1.9948077201843262, "rewards/rejected": 3.880992889404297, "step": 7442 }, { "epoch": 1.65, "learning_rate": 7.932919345383483e-07, "logits/chosen": -1.8281503915786743, "logits/rejected": -1.827345609664917, "logps/chosen": -55.93856430053711, "logps/rejected": -55.56370544433594, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": 5.163562297821045, "rewards/margins": 1.9879977703094482, "rewards/rejected": 3.1755645275115967, "step": 7443 }, { "epoch": 1.65, "learning_rate": 7.923234448108041e-07, "logits/chosen": -2.0651652812957764, "logits/rejected": -2.082279920578003, "logps/chosen": -12.630758285522461, "logps/rejected": -44.973968505859375, "loss": 0.9767, "rewards/accuracies": 0.0, "rewards/chosen": 1.949644684791565, "rewards/margins": -1.7010427713394165, "rewards/rejected": 3.6506874561309814, "step": 7444 }, { "epoch": 1.65, "learning_rate": 7.913554957595826e-07, "logits/chosen": -1.7607983350753784, "logits/rejected": -1.7398067712783813, "logps/chosen": -59.390357971191406, "logps/rejected": -58.74449920654297, "loss": 0.5195, "rewards/accuracies": 1.0, "rewards/chosen": 2.36506724357605, "rewards/margins": 0.8839919567108154, "rewards/rejected": 1.4810752868652344, "step": 7445 }, { "epoch": 1.65, "learning_rate": 7.903880875090619e-07, "logits/chosen": -1.6310070753097534, "logits/rejected": -1.5198088884353638, "logps/chosen": -39.571189880371094, "logps/rejected": -5.611769199371338, "loss": 0.147, "rewards/accuracies": 1.0, "rewards/chosen": 2.7935402393341064, "rewards/margins": 1.712988257408142, "rewards/rejected": 1.0805519819259644, "step": 7446 }, { "epoch": 1.65, "learning_rate": 7.894212201835539e-07, "logits/chosen": -2.415024995803833, "logits/rejected": -2.3918471336364746, "logps/chosen": -38.07710266113281, "logps/rejected": -19.000171661376953, "loss": 2.5381, "rewards/accuracies": 1.0, "rewards/chosen": 2.7008864879608154, "rewards/margins": 1.829451322555542, "rewards/rejected": 0.8714351654052734, "step": 7447 }, { "epoch": 1.65, "learning_rate": 7.88454893907295e-07, "logits/chosen": -1.9878380298614502, "logits/rejected": -1.8572720289230347, "logps/chosen": -75.65304565429688, "logps/rejected": -202.1912841796875, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 8.487597465515137, "rewards/margins": 3.291293144226074, "rewards/rejected": 5.1963043212890625, "step": 7448 }, { "epoch": 1.65, "learning_rate": 7.874891088044601e-07, "logits/chosen": -1.647477149963379, "logits/rejected": -1.5531446933746338, "logps/chosen": -137.42298889160156, "logps/rejected": -72.85151672363281, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 6.978129863739014, "rewards/margins": 3.2332186698913574, "rewards/rejected": 3.7449111938476562, "step": 7449 }, { "epoch": 1.65, "learning_rate": 7.865238649991469e-07, "logits/chosen": -1.8325214385986328, "logits/rejected": -1.8327887058258057, "logps/chosen": -40.714996337890625, "logps/rejected": -53.50368881225586, "loss": 0.1301, "rewards/accuracies": 1.0, "rewards/chosen": 4.402294158935547, "rewards/margins": 1.2583339214324951, "rewards/rejected": 3.1439602375030518, "step": 7450 }, { "epoch": 1.65, "learning_rate": 7.85559162615388e-07, "logits/chosen": -1.8155344724655151, "logits/rejected": -1.8155344724655151, "logps/chosen": -68.91751861572266, "logps/rejected": -68.91751861572266, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.6985023021698, "rewards/margins": 0.0, "rewards/rejected": 3.6985023021698, "step": 7451 }, { "epoch": 1.65, "learning_rate": 7.845950017771459e-07, "logits/chosen": -1.7341127395629883, "logits/rejected": -1.8032430410385132, "logps/chosen": -41.48783493041992, "logps/rejected": -158.6507110595703, "loss": 0.2701, "rewards/accuracies": 1.0, "rewards/chosen": 6.318930625915527, "rewards/margins": 2.090050220489502, "rewards/rejected": 4.228880405426025, "step": 7452 }, { "epoch": 1.65, "learning_rate": 7.83631382608312e-07, "logits/chosen": -1.711431622505188, "logits/rejected": -1.6710132360458374, "logps/chosen": -30.268264770507812, "logps/rejected": -24.75537109375, "loss": 0.1483, "rewards/accuracies": 1.0, "rewards/chosen": 3.524204969406128, "rewards/margins": 1.6535929441452026, "rewards/rejected": 1.8706120252609253, "step": 7453 }, { "epoch": 1.65, "learning_rate": 7.8266830523271e-07, "logits/chosen": -2.1004207134246826, "logits/rejected": -2.139557361602783, "logps/chosen": -54.534423828125, "logps/rejected": -57.573360443115234, "loss": 2.2431, "rewards/accuracies": 0.0, "rewards/chosen": 3.72723388671875, "rewards/margins": -4.403024673461914, "rewards/rejected": 8.130258560180664, "step": 7454 }, { "epoch": 1.65, "learning_rate": 7.817057697740921e-07, "logits/chosen": -2.011556386947632, "logits/rejected": -1.8284080028533936, "logps/chosen": -96.02532196044922, "logps/rejected": -14.133806228637695, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 9.421509742736816, "rewards/margins": 7.9190144538879395, "rewards/rejected": 1.5024954080581665, "step": 7455 }, { "epoch": 1.65, "learning_rate": 7.807437763561426e-07, "logits/chosen": -1.6965548992156982, "logits/rejected": -1.6734739542007446, "logps/chosen": -38.542537689208984, "logps/rejected": -40.83165740966797, "loss": 0.6324, "rewards/accuracies": 1.0, "rewards/chosen": 1.9353077411651611, "rewards/margins": 0.24519240856170654, "rewards/rejected": 1.6901153326034546, "step": 7456 }, { "epoch": 1.65, "learning_rate": 7.79782325102475e-07, "logits/chosen": -1.8947407007217407, "logits/rejected": -1.8302457332611084, "logps/chosen": -25.832717895507812, "logps/rejected": -75.78692626953125, "loss": 0.995, "rewards/accuracies": 0.0, "rewards/chosen": 4.772458076477051, "rewards/margins": -1.8415923118591309, "rewards/rejected": 6.614050388336182, "step": 7457 }, { "epoch": 1.65, "learning_rate": 7.788214161366347e-07, "logits/chosen": -1.511134386062622, "logits/rejected": -1.511134386062622, "logps/chosen": -39.59022521972656, "logps/rejected": -39.59022521972656, "loss": 0.3957, "rewards/accuracies": 0.0, "rewards/chosen": 2.7031776905059814, "rewards/margins": 0.0, "rewards/rejected": 2.7031776905059814, "step": 7458 }, { "epoch": 1.65, "learning_rate": 7.778610495820937e-07, "logits/chosen": -1.5694929361343384, "logits/rejected": -1.4050798416137695, "logps/chosen": -40.144798278808594, "logps/rejected": -10.989142417907715, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 3.3242623805999756, "rewards/margins": 2.626099109649658, "rewards/rejected": 0.6981633305549622, "step": 7459 }, { "epoch": 1.65, "learning_rate": 7.769012255622583e-07, "logits/chosen": -1.829334020614624, "logits/rejected": -1.6917333602905273, "logps/chosen": -32.48289108276367, "logps/rejected": -70.10560607910156, "loss": 0.3706, "rewards/accuracies": 1.0, "rewards/chosen": 3.1672322750091553, "rewards/margins": 0.5460386276245117, "rewards/rejected": 2.6211936473846436, "step": 7460 }, { "epoch": 1.65, "learning_rate": 7.759419442004634e-07, "logits/chosen": -1.6291062831878662, "logits/rejected": -1.454727292060852, "logps/chosen": -51.83612060546875, "logps/rejected": -20.113452911376953, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": 2.8821334838867188, "rewards/margins": 2.9524712562561035, "rewards/rejected": -0.070337675511837, "step": 7461 }, { "epoch": 1.65, "learning_rate": 7.749832056199741e-07, "logits/chosen": -1.805958867073059, "logits/rejected": -1.7760810852050781, "logps/chosen": -42.128196716308594, "logps/rejected": -60.98863220214844, "loss": 0.1461, "rewards/accuracies": 1.0, "rewards/chosen": 3.646075487136841, "rewards/margins": 1.0817596912384033, "rewards/rejected": 2.5643157958984375, "step": 7462 }, { "epoch": 1.65, "learning_rate": 7.740250099439861e-07, "logits/chosen": -1.876081943511963, "logits/rejected": -1.8455158472061157, "logps/chosen": -121.88822937011719, "logps/rejected": -29.15218734741211, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 6.716050624847412, "rewards/margins": 3.533064842224121, "rewards/rejected": 3.182985782623291, "step": 7463 }, { "epoch": 1.65, "learning_rate": 7.730673572956254e-07, "logits/chosen": -1.7942674160003662, "logits/rejected": -1.7942674160003662, "logps/chosen": -14.963332176208496, "logps/rejected": -14.963332176208496, "loss": 0.592, "rewards/accuracies": 0.0, "rewards/chosen": 2.6081836223602295, "rewards/margins": 0.0, "rewards/rejected": 2.6081836223602295, "step": 7464 }, { "epoch": 1.65, "learning_rate": 7.721102477979487e-07, "logits/chosen": -1.7698496580123901, "logits/rejected": -1.461118221282959, "logps/chosen": -85.55781555175781, "logps/rejected": -15.024065017700195, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": 5.5722808837890625, "rewards/margins": 2.648145914077759, "rewards/rejected": 2.9241349697113037, "step": 7465 }, { "epoch": 1.65, "learning_rate": 7.711536815739407e-07, "logits/chosen": -1.6809639930725098, "logits/rejected": -1.6725635528564453, "logps/chosen": -60.298118591308594, "logps/rejected": -52.60279846191406, "loss": 0.2324, "rewards/accuracies": 1.0, "rewards/chosen": 4.691849708557129, "rewards/margins": 0.5415887832641602, "rewards/rejected": 4.150260925292969, "step": 7466 }, { "epoch": 1.65, "learning_rate": 7.701976587465204e-07, "logits/chosen": -1.9995869398117065, "logits/rejected": -1.9530643224716187, "logps/chosen": -194.9044189453125, "logps/rejected": -26.166973114013672, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 6.653485298156738, "rewards/margins": 4.917186737060547, "rewards/rejected": 1.7362987995147705, "step": 7467 }, { "epoch": 1.65, "learning_rate": 7.692421794385313e-07, "logits/chosen": -1.5333080291748047, "logits/rejected": -1.5359755754470825, "logps/chosen": -48.721500396728516, "logps/rejected": -78.02438354492188, "loss": 0.241, "rewards/accuracies": 1.0, "rewards/chosen": 4.060426235198975, "rewards/margins": 0.5236341953277588, "rewards/rejected": 3.536792039871216, "step": 7468 }, { "epoch": 1.65, "learning_rate": 7.682872437727518e-07, "logits/chosen": -2.0641958713531494, "logits/rejected": -1.9683645963668823, "logps/chosen": -59.58021545410156, "logps/rejected": -25.28766441345215, "loss": 0.1094, "rewards/accuracies": 1.0, "rewards/chosen": 4.1289215087890625, "rewards/margins": 2.4587628841400146, "rewards/rejected": 1.6701586246490479, "step": 7469 }, { "epoch": 1.65, "learning_rate": 7.673328518718892e-07, "logits/chosen": -2.125744104385376, "logits/rejected": -2.1077518463134766, "logps/chosen": -48.819801330566406, "logps/rejected": -43.843841552734375, "loss": 0.7571, "rewards/accuracies": 0.0, "rewards/chosen": 3.6950058937072754, "rewards/margins": -0.21415090560913086, "rewards/rejected": 3.9091567993164062, "step": 7470 }, { "epoch": 1.65, "learning_rate": 7.663790038585794e-07, "logits/chosen": -2.2445642948150635, "logits/rejected": -2.2666001319885254, "logps/chosen": -122.19906616210938, "logps/rejected": -48.46867370605469, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 8.505609512329102, "rewards/margins": 6.012855529785156, "rewards/rejected": 2.492753744125366, "step": 7471 }, { "epoch": 1.65, "learning_rate": 7.654256998553905e-07, "logits/chosen": -2.1000583171844482, "logits/rejected": -2.0061216354370117, "logps/chosen": -180.9547119140625, "logps/rejected": -46.87874984741211, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 7.24847412109375, "rewards/margins": 2.9326725006103516, "rewards/rejected": 4.315801620483398, "step": 7472 }, { "epoch": 1.65, "learning_rate": 7.644729399848199e-07, "logits/chosen": -2.159438133239746, "logits/rejected": -2.2273645401000977, "logps/chosen": -36.71445846557617, "logps/rejected": -72.50557708740234, "loss": 0.9837, "rewards/accuracies": 0.0, "rewards/chosen": 3.8246853351593018, "rewards/margins": -1.4318859577178955, "rewards/rejected": 5.256571292877197, "step": 7473 }, { "epoch": 1.65, "learning_rate": 7.635207243692938e-07, "logits/chosen": -1.7185108661651611, "logits/rejected": -1.6949477195739746, "logps/chosen": -106.34124755859375, "logps/rejected": -68.3425064086914, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 7.44478178024292, "rewards/margins": 3.557736396789551, "rewards/rejected": 3.887045383453369, "step": 7474 }, { "epoch": 1.65, "learning_rate": 7.625690531311725e-07, "logits/chosen": -1.792790174484253, "logits/rejected": -1.8581141233444214, "logps/chosen": -63.23580551147461, "logps/rejected": -131.09934997558594, "loss": 0.3957, "rewards/accuracies": 0.0, "rewards/chosen": 13.294568061828613, "rewards/margins": -0.13966846466064453, "rewards/rejected": 13.434236526489258, "step": 7475 }, { "epoch": 1.65, "learning_rate": 7.616179263927381e-07, "logits/chosen": -1.95918607711792, "logits/rejected": -1.9329345226287842, "logps/chosen": -48.917015075683594, "logps/rejected": -58.949501037597656, "loss": 0.2651, "rewards/accuracies": 1.0, "rewards/chosen": 4.439631938934326, "rewards/margins": 0.4944026470184326, "rewards/rejected": 3.9452292919158936, "step": 7476 }, { "epoch": 1.65, "learning_rate": 7.606673442762147e-07, "logits/chosen": -1.840001106262207, "logits/rejected": -1.847149133682251, "logps/chosen": -25.66175651550293, "logps/rejected": -32.7888069152832, "loss": 0.5016, "rewards/accuracies": 1.0, "rewards/chosen": 3.1566615104675293, "rewards/margins": 0.3179469108581543, "rewards/rejected": 2.838714599609375, "step": 7477 }, { "epoch": 1.66, "learning_rate": 7.597173069037444e-07, "logits/chosen": -2.126132011413574, "logits/rejected": -1.5813616514205933, "logps/chosen": -44.9754638671875, "logps/rejected": -72.02947998046875, "loss": 0.141, "rewards/accuracies": 1.0, "rewards/chosen": 3.4181067943573, "rewards/margins": 1.1399328708648682, "rewards/rejected": 2.2781739234924316, "step": 7478 }, { "epoch": 1.66, "learning_rate": 7.587678143974075e-07, "logits/chosen": -1.9261486530303955, "logits/rejected": -1.9729130268096924, "logps/chosen": -35.5695915222168, "logps/rejected": -73.88251495361328, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 4.675668716430664, "rewards/margins": 1.5404512882232666, "rewards/rejected": 3.1352174282073975, "step": 7479 }, { "epoch": 1.66, "learning_rate": 7.578188668792103e-07, "logits/chosen": -1.8829982280731201, "logits/rejected": -1.9368096590042114, "logps/chosen": -21.5517578125, "logps/rejected": -52.93022918701172, "loss": 2.3617, "rewards/accuracies": 0.0, "rewards/chosen": 2.017855167388916, "rewards/margins": -0.889594316482544, "rewards/rejected": 2.90744948387146, "step": 7480 }, { "epoch": 1.66, "learning_rate": 7.568704644710912e-07, "logits/chosen": -1.7051615715026855, "logits/rejected": -1.6668505668640137, "logps/chosen": -67.91116333007812, "logps/rejected": -56.14533233642578, "loss": 0.4348, "rewards/accuracies": 0.0, "rewards/chosen": 6.553840637207031, "rewards/margins": -0.13748884201049805, "rewards/rejected": 6.691329479217529, "step": 7481 }, { "epoch": 1.66, "learning_rate": 7.559226072949166e-07, "logits/chosen": -1.748255968093872, "logits/rejected": -1.763168215751648, "logps/chosen": -50.502967834472656, "logps/rejected": -41.66636276245117, "loss": 0.3135, "rewards/accuracies": 1.0, "rewards/chosen": 3.051474094390869, "rewards/margins": 0.13757061958312988, "rewards/rejected": 2.9139034748077393, "step": 7482 }, { "epoch": 1.66, "learning_rate": 7.549752954724854e-07, "logits/chosen": -2.3569424152374268, "logits/rejected": -2.4000394344329834, "logps/chosen": -107.42527770996094, "logps/rejected": -48.49522399902344, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 7.136531352996826, "rewards/margins": 3.1799235343933105, "rewards/rejected": 3.9566078186035156, "step": 7483 }, { "epoch": 1.66, "learning_rate": 7.540285291255245e-07, "logits/chosen": -1.9623281955718994, "logits/rejected": -1.540202021598816, "logps/chosen": -113.13357543945312, "logps/rejected": -50.611759185791016, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 7.357921123504639, "rewards/margins": 6.508329391479492, "rewards/rejected": 0.849591851234436, "step": 7484 }, { "epoch": 1.66, "learning_rate": 7.530823083756883e-07, "logits/chosen": -1.8423100709915161, "logits/rejected": -1.83719801902771, "logps/chosen": -50.441062927246094, "logps/rejected": -38.306884765625, "loss": 0.2153, "rewards/accuracies": 1.0, "rewards/chosen": 3.1758086681365967, "rewards/margins": 1.2924151420593262, "rewards/rejected": 1.8833935260772705, "step": 7485 }, { "epoch": 1.66, "learning_rate": 7.521366333445684e-07, "logits/chosen": -1.8798952102661133, "logits/rejected": -1.8299165964126587, "logps/chosen": -53.540367126464844, "logps/rejected": -34.263065338134766, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 4.690915584564209, "rewards/margins": 0.2458949089050293, "rewards/rejected": 4.44502067565918, "step": 7486 }, { "epoch": 1.66, "learning_rate": 7.511915041536777e-07, "logits/chosen": -2.0348751544952393, "logits/rejected": -1.9618921279907227, "logps/chosen": -119.89607238769531, "logps/rejected": -77.87165832519531, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 5.731300354003906, "rewards/margins": 3.196997880935669, "rewards/rejected": 2.5343024730682373, "step": 7487 }, { "epoch": 1.66, "learning_rate": 7.502469209244667e-07, "logits/chosen": -1.7518653869628906, "logits/rejected": -1.7201366424560547, "logps/chosen": -66.91313934326172, "logps/rejected": -86.06731414794922, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": 5.831972599029541, "rewards/margins": 1.338435173034668, "rewards/rejected": 4.493537425994873, "step": 7488 }, { "epoch": 1.66, "learning_rate": 7.49302883778309e-07, "logits/chosen": -1.7720723152160645, "logits/rejected": -1.687691569328308, "logps/chosen": -69.71113586425781, "logps/rejected": -54.808692932128906, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 6.626853942871094, "rewards/margins": 5.688045501708984, "rewards/rejected": 0.9388084411621094, "step": 7489 }, { "epoch": 1.66, "learning_rate": 7.483593928365129e-07, "logits/chosen": -2.2590367794036865, "logits/rejected": -2.2614550590515137, "logps/chosen": -39.70976257324219, "logps/rejected": -35.15196228027344, "loss": 0.3158, "rewards/accuracies": 1.0, "rewards/chosen": 3.6923768520355225, "rewards/margins": 0.14027643203735352, "rewards/rejected": 3.552100419998169, "step": 7490 }, { "epoch": 1.66, "learning_rate": 7.474164482203139e-07, "logits/chosen": -1.949519395828247, "logits/rejected": -1.9429271221160889, "logps/chosen": -35.51051330566406, "logps/rejected": -57.71228790283203, "loss": 1.4521, "rewards/accuracies": 0.0, "rewards/chosen": 2.165976047515869, "rewards/margins": -0.6651434898376465, "rewards/rejected": 2.8311195373535156, "step": 7491 }, { "epoch": 1.66, "learning_rate": 7.464740500508783e-07, "logits/chosen": -1.861655354499817, "logits/rejected": -1.7519035339355469, "logps/chosen": -57.714637756347656, "logps/rejected": -39.82335662841797, "loss": 0.3521, "rewards/accuracies": 1.0, "rewards/chosen": 2.589759111404419, "rewards/margins": 1.7441201210021973, "rewards/rejected": 0.8456390500068665, "step": 7492 }, { "epoch": 1.66, "learning_rate": 7.45532198449303e-07, "logits/chosen": -1.9002113342285156, "logits/rejected": -1.8464876413345337, "logps/chosen": -153.44131469726562, "logps/rejected": -141.2030029296875, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 7.6443681716918945, "rewards/margins": 2.5770277976989746, "rewards/rejected": 5.06734037399292, "step": 7493 }, { "epoch": 1.66, "learning_rate": 7.445908935366109e-07, "logits/chosen": -1.6926662921905518, "logits/rejected": -1.6620744466781616, "logps/chosen": -40.148231506347656, "logps/rejected": -42.79763412475586, "loss": 0.3992, "rewards/accuracies": 0.0, "rewards/chosen": 3.3857247829437256, "rewards/margins": -0.07068729400634766, "rewards/rejected": 3.4564120769500732, "step": 7494 }, { "epoch": 1.66, "learning_rate": 7.43650135433761e-07, "logits/chosen": -1.742472767829895, "logits/rejected": -1.6481062173843384, "logps/chosen": -31.261951446533203, "logps/rejected": -14.739052772521973, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 4.194161891937256, "rewards/margins": 4.109076499938965, "rewards/rejected": 0.08508539199829102, "step": 7495 }, { "epoch": 1.66, "learning_rate": 7.427099242616348e-07, "logits/chosen": -2.0987653732299805, "logits/rejected": -2.050612688064575, "logps/chosen": -80.35067749023438, "logps/rejected": -106.5587387084961, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": 6.927931308746338, "rewards/margins": 4.129843711853027, "rewards/rejected": 2.7980873584747314, "step": 7496 }, { "epoch": 1.66, "learning_rate": 7.417702601410515e-07, "logits/chosen": -1.8630892038345337, "logits/rejected": -1.8630892038345337, "logps/chosen": -32.40155029296875, "logps/rejected": -32.40155029296875, "loss": 0.3613, "rewards/accuracies": 0.0, "rewards/chosen": 1.9456017017364502, "rewards/margins": 0.0, "rewards/rejected": 1.9456017017364502, "step": 7497 }, { "epoch": 1.66, "learning_rate": 7.408311431927517e-07, "logits/chosen": -2.0338995456695557, "logits/rejected": -2.022857427597046, "logps/chosen": -32.58555603027344, "logps/rejected": -151.1388702392578, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": 2.7796528339385986, "rewards/margins": 1.43256413936615, "rewards/rejected": 1.3470886945724487, "step": 7498 }, { "epoch": 1.66, "learning_rate": 7.398925735374119e-07, "logits/chosen": -1.930228352546692, "logits/rejected": -1.9498906135559082, "logps/chosen": -49.52534484863281, "logps/rejected": -52.37781524658203, "loss": 0.5582, "rewards/accuracies": 1.0, "rewards/chosen": 3.014514207839966, "rewards/margins": 0.13599395751953125, "rewards/rejected": 2.8785202503204346, "step": 7499 }, { "epoch": 1.66, "learning_rate": 7.389545512956347e-07, "logits/chosen": -2.0876686573028564, "logits/rejected": -2.0883898735046387, "logps/chosen": -97.40284729003906, "logps/rejected": -51.270111083984375, "loss": 0.2895, "rewards/accuracies": 1.0, "rewards/chosen": 6.622079372406006, "rewards/margins": 0.3145637512207031, "rewards/rejected": 6.307515621185303, "step": 7500 }, { "epoch": 1.66, "learning_rate": 7.380170765879547e-07, "logits/chosen": -1.8358330726623535, "logits/rejected": -1.8358330726623535, "logps/chosen": -24.552982330322266, "logps/rejected": -24.552982330322266, "loss": 0.4781, "rewards/accuracies": 0.0, "rewards/chosen": 1.4414719343185425, "rewards/margins": 0.0, "rewards/rejected": 1.4414719343185425, "step": 7501 }, { "epoch": 1.66, "learning_rate": 7.370801495348362e-07, "logits/chosen": -1.912632942199707, "logits/rejected": -1.9498497247695923, "logps/chosen": -48.471214294433594, "logps/rejected": -102.83975982666016, "loss": 0.9011, "rewards/accuracies": 0.0, "rewards/chosen": 4.207581520080566, "rewards/margins": -1.509028434753418, "rewards/rejected": 5.716609954833984, "step": 7502 }, { "epoch": 1.66, "learning_rate": 7.361437702566681e-07, "logits/chosen": -1.8369320631027222, "logits/rejected": -1.8451449871063232, "logps/chosen": -48.33997344970703, "logps/rejected": -139.12420654296875, "loss": 0.0675, "rewards/accuracies": 1.0, "rewards/chosen": 6.890132904052734, "rewards/margins": 2.0231008529663086, "rewards/rejected": 4.867032051086426, "step": 7503 }, { "epoch": 1.66, "learning_rate": 7.352079388737777e-07, "logits/chosen": -1.9040799140930176, "logits/rejected": -1.842009425163269, "logps/chosen": -42.433563232421875, "logps/rejected": -34.31937789916992, "loss": 0.3553, "rewards/accuracies": 1.0, "rewards/chosen": 5.622018337249756, "rewards/margins": 4.037160396575928, "rewards/rejected": 1.5848579406738281, "step": 7504 }, { "epoch": 1.66, "learning_rate": 7.342726555064129e-07, "logits/chosen": -1.9668090343475342, "logits/rejected": -1.8878424167633057, "logps/chosen": -83.57266235351562, "logps/rejected": -29.88619613647461, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 7.5497894287109375, "rewards/margins": 5.018310070037842, "rewards/rejected": 2.5314793586730957, "step": 7505 }, { "epoch": 1.66, "learning_rate": 7.333379202747592e-07, "logits/chosen": -1.9971461296081543, "logits/rejected": -1.9800149202346802, "logps/chosen": -83.11918640136719, "logps/rejected": -63.677825927734375, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 5.292050361633301, "rewards/margins": 4.049134254455566, "rewards/rejected": 1.2429161071777344, "step": 7506 }, { "epoch": 1.66, "learning_rate": 7.324037332989253e-07, "logits/chosen": -2.036400318145752, "logits/rejected": -1.9647804498672485, "logps/chosen": -127.5218505859375, "logps/rejected": -41.193450927734375, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 7.480947971343994, "rewards/margins": 2.1020827293395996, "rewards/rejected": 5.3788652420043945, "step": 7507 }, { "epoch": 1.66, "learning_rate": 7.314700946989528e-07, "logits/chosen": -1.7614082098007202, "logits/rejected": -1.7272356748580933, "logps/chosen": -33.92133712768555, "logps/rejected": -21.376333236694336, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": 3.8621692657470703, "rewards/margins": 2.1276068687438965, "rewards/rejected": 1.7345625162124634, "step": 7508 }, { "epoch": 1.66, "learning_rate": 7.305370045948112e-07, "logits/chosen": -1.6225183010101318, "logits/rejected": -1.6085542440414429, "logps/chosen": -8.959320068359375, "logps/rejected": -13.06141185760498, "loss": 1.2502, "rewards/accuracies": 0.0, "rewards/chosen": 0.9822344183921814, "rewards/margins": -2.2786264419555664, "rewards/rejected": 3.2608609199523926, "step": 7509 }, { "epoch": 1.66, "learning_rate": 7.296044631064014e-07, "logits/chosen": -1.717189073562622, "logits/rejected": -1.7648359537124634, "logps/chosen": -40.104896545410156, "logps/rejected": -56.79273986816406, "loss": 0.6432, "rewards/accuracies": 0.0, "rewards/chosen": 4.863924503326416, "rewards/margins": -0.5591225624084473, "rewards/rejected": 5.423047065734863, "step": 7510 }, { "epoch": 1.66, "learning_rate": 7.286724703535535e-07, "logits/chosen": -1.8525972366333008, "logits/rejected": -1.7020344734191895, "logps/chosen": -37.60059356689453, "logps/rejected": -19.58062744140625, "loss": 0.1514, "rewards/accuracies": 1.0, "rewards/chosen": 4.696688175201416, "rewards/margins": 4.126953125, "rewards/rejected": 0.5697349905967712, "step": 7511 }, { "epoch": 1.66, "learning_rate": 7.277410264560225e-07, "logits/chosen": -2.080359935760498, "logits/rejected": -2.094404935836792, "logps/chosen": -32.31554412841797, "logps/rejected": -59.60110092163086, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 4.214075565338135, "rewards/margins": 1.0903401374816895, "rewards/rejected": 3.1237354278564453, "step": 7512 }, { "epoch": 1.66, "learning_rate": 7.268101315335024e-07, "logits/chosen": -1.8509284257888794, "logits/rejected": -1.8509284257888794, "logps/chosen": -47.825767517089844, "logps/rejected": -47.825767517089844, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 4.019700050354004, "rewards/margins": 0.0, "rewards/rejected": 4.019700050354004, "step": 7513 }, { "epoch": 1.66, "learning_rate": 7.258797857056049e-07, "logits/chosen": -2.306727170944214, "logits/rejected": -2.320829391479492, "logps/chosen": -75.03242492675781, "logps/rejected": -44.76421356201172, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 9.435811042785645, "rewards/margins": 5.398886203765869, "rewards/rejected": 4.036924839019775, "step": 7514 }, { "epoch": 1.66, "learning_rate": 7.249499890918832e-07, "logits/chosen": -1.9054077863693237, "logits/rejected": -1.730302333831787, "logps/chosen": -130.1002197265625, "logps/rejected": -34.548439025878906, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 8.624648094177246, "rewards/margins": 8.35605239868164, "rewards/rejected": 0.26859551668167114, "step": 7515 }, { "epoch": 1.66, "learning_rate": 7.240207418118094e-07, "logits/chosen": -1.946213722229004, "logits/rejected": -2.002035617828369, "logps/chosen": -27.898975372314453, "logps/rejected": -72.46762084960938, "loss": 0.9397, "rewards/accuracies": 0.0, "rewards/chosen": 3.1489875316619873, "rewards/margins": -1.7130167484283447, "rewards/rejected": 4.862004280090332, "step": 7516 }, { "epoch": 1.66, "learning_rate": 7.23092043984791e-07, "logits/chosen": -1.8101085424423218, "logits/rejected": -1.7038272619247437, "logps/chosen": -39.784393310546875, "logps/rejected": -8.601875305175781, "loss": 0.157, "rewards/accuracies": 1.0, "rewards/chosen": 2.262688398361206, "rewards/margins": 1.1990464925765991, "rewards/rejected": 1.063641905784607, "step": 7517 }, { "epoch": 1.66, "learning_rate": 7.221638957301635e-07, "logits/chosen": -2.032017707824707, "logits/rejected": -2.001312017440796, "logps/chosen": -55.06962585449219, "logps/rejected": -52.628353118896484, "loss": 0.3701, "rewards/accuracies": 1.0, "rewards/chosen": 3.4428040981292725, "rewards/margins": 0.06760311126708984, "rewards/rejected": 3.3752009868621826, "step": 7518 }, { "epoch": 1.66, "learning_rate": 7.212362971671922e-07, "logits/chosen": -1.9961886405944824, "logits/rejected": -1.760490894317627, "logps/chosen": -67.89338684082031, "logps/rejected": -19.487009048461914, "loss": 0.2155, "rewards/accuracies": 1.0, "rewards/chosen": 2.4466233253479004, "rewards/margins": 1.0081778764724731, "rewards/rejected": 1.4384454488754272, "step": 7519 }, { "epoch": 1.66, "learning_rate": 7.20309248415072e-07, "logits/chosen": -1.847305417060852, "logits/rejected": -1.8512028455734253, "logps/chosen": -42.89308166503906, "logps/rejected": -73.68550109863281, "loss": 0.711, "rewards/accuracies": 0.0, "rewards/chosen": 4.240633487701416, "rewards/margins": -1.0675148963928223, "rewards/rejected": 5.308148384094238, "step": 7520 }, { "epoch": 1.66, "learning_rate": 7.193827495929223e-07, "logits/chosen": -1.6990952491760254, "logits/rejected": -1.721909761428833, "logps/chosen": -18.590328216552734, "logps/rejected": -60.55075454711914, "loss": 0.6165, "rewards/accuracies": 1.0, "rewards/chosen": 3.2120678424835205, "rewards/margins": 0.3346221446990967, "rewards/rejected": 2.877445697784424, "step": 7521 }, { "epoch": 1.66, "learning_rate": 7.184568008198018e-07, "logits/chosen": -1.973956823348999, "logits/rejected": -2.024916410446167, "logps/chosen": -113.30264282226562, "logps/rejected": -98.65788269042969, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 11.779637336730957, "rewards/margins": 1.6346406936645508, "rewards/rejected": 10.144996643066406, "step": 7522 }, { "epoch": 1.67, "learning_rate": 7.175314022146879e-07, "logits/chosen": -1.5695617198944092, "logits/rejected": -1.5695617198944092, "logps/chosen": -3.1419546604156494, "logps/rejected": -3.1419546604156494, "loss": 1.2059, "rewards/accuracies": 0.0, "rewards/chosen": 0.9637819528579712, "rewards/margins": 0.0, "rewards/rejected": 0.9637819528579712, "step": 7523 }, { "epoch": 1.67, "learning_rate": 7.166065538964955e-07, "logits/chosen": -2.0155375003814697, "logits/rejected": -2.002723455429077, "logps/chosen": -60.206581115722656, "logps/rejected": -151.10223388671875, "loss": 0.1805, "rewards/accuracies": 1.0, "rewards/chosen": 8.979722023010254, "rewards/margins": 0.8337488174438477, "rewards/rejected": 8.145973205566406, "step": 7524 }, { "epoch": 1.67, "learning_rate": 7.156822559840632e-07, "logits/chosen": -1.852631688117981, "logits/rejected": -1.852631688117981, "logps/chosen": -45.95232009887695, "logps/rejected": -45.95232009887695, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 5.7383832931518555, "rewards/margins": 0.0, "rewards/rejected": 5.7383832931518555, "step": 7525 }, { "epoch": 1.67, "learning_rate": 7.147585085961623e-07, "logits/chosen": -1.851878046989441, "logits/rejected": -1.7494091987609863, "logps/chosen": -100.52764892578125, "logps/rejected": -51.03801727294922, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 5.4805192947387695, "rewards/margins": 2.4904534816741943, "rewards/rejected": 2.990065813064575, "step": 7526 }, { "epoch": 1.67, "learning_rate": 7.138353118514907e-07, "logits/chosen": -1.9954497814178467, "logits/rejected": -1.9355638027191162, "logps/chosen": -74.21138000488281, "logps/rejected": -63.81971740722656, "loss": 0.1801, "rewards/accuracies": 1.0, "rewards/chosen": 5.521359443664551, "rewards/margins": 0.8375430107116699, "rewards/rejected": 4.683816432952881, "step": 7527 }, { "epoch": 1.67, "learning_rate": 7.129126658686785e-07, "logits/chosen": -1.917305588722229, "logits/rejected": -1.939802885055542, "logps/chosen": -78.62937927246094, "logps/rejected": -101.67121124267578, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 9.90442180633545, "rewards/margins": 0.5578327178955078, "rewards/rejected": 9.346589088439941, "step": 7528 }, { "epoch": 1.67, "learning_rate": 7.119905707662838e-07, "logits/chosen": -2.332538366317749, "logits/rejected": -2.321789264678955, "logps/chosen": -41.3560905456543, "logps/rejected": -69.70638275146484, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 4.205733776092529, "rewards/margins": 1.5303752422332764, "rewards/rejected": 2.675358533859253, "step": 7529 }, { "epoch": 1.67, "learning_rate": 7.110690266627901e-07, "logits/chosen": -2.0221340656280518, "logits/rejected": -1.912063479423523, "logps/chosen": -119.07757568359375, "logps/rejected": -58.90836715698242, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 6.333108425140381, "rewards/margins": 2.5925004482269287, "rewards/rejected": 3.740607976913452, "step": 7530 }, { "epoch": 1.67, "learning_rate": 7.101480336766187e-07, "logits/chosen": -1.9177913665771484, "logits/rejected": -1.835532546043396, "logps/chosen": -47.83793258666992, "logps/rejected": -76.83940124511719, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 5.737032890319824, "rewards/margins": 1.58656644821167, "rewards/rejected": 4.150466442108154, "step": 7531 }, { "epoch": 1.67, "learning_rate": 7.092275919261104e-07, "logits/chosen": -1.8265267610549927, "logits/rejected": -1.8149793148040771, "logps/chosen": -25.84961700439453, "logps/rejected": -40.030616760253906, "loss": 0.287, "rewards/accuracies": 1.0, "rewards/chosen": 3.965412139892578, "rewards/margins": 0.2554352283477783, "rewards/rejected": 3.7099769115448, "step": 7532 }, { "epoch": 1.67, "learning_rate": 7.083077015295437e-07, "logits/chosen": -1.6826826333999634, "logits/rejected": -1.6712852716445923, "logps/chosen": -47.292301177978516, "logps/rejected": -26.207347869873047, "loss": 0.8797, "rewards/accuracies": 0.0, "rewards/chosen": 2.232898473739624, "rewards/margins": -0.9972782135009766, "rewards/rejected": 3.2301766872406006, "step": 7533 }, { "epoch": 1.67, "learning_rate": 7.073883626051198e-07, "logits/chosen": -1.8431565761566162, "logits/rejected": -1.8338866233825684, "logps/chosen": -41.843971252441406, "logps/rejected": -74.42626953125, "loss": 0.2361, "rewards/accuracies": 1.0, "rewards/chosen": 2.7745094299316406, "rewards/margins": 0.7156882286071777, "rewards/rejected": 2.058821201324463, "step": 7534 }, { "epoch": 1.67, "learning_rate": 7.064695752709715e-07, "logits/chosen": -2.0227394104003906, "logits/rejected": -2.0111610889434814, "logps/chosen": -104.50199127197266, "logps/rejected": -152.28305053710938, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 9.753820419311523, "rewards/margins": 4.459255695343018, "rewards/rejected": 5.294564723968506, "step": 7535 }, { "epoch": 1.67, "learning_rate": 7.055513396451619e-07, "logits/chosen": -2.101100206375122, "logits/rejected": -2.1086418628692627, "logps/chosen": -45.618446350097656, "logps/rejected": -70.41556549072266, "loss": 1.3405, "rewards/accuracies": 1.0, "rewards/chosen": 4.341531276702881, "rewards/margins": 2.779660701751709, "rewards/rejected": 1.5618705749511719, "step": 7536 }, { "epoch": 1.67, "learning_rate": 7.046336558456812e-07, "logits/chosen": -1.9610079526901245, "logits/rejected": -1.9544832706451416, "logps/chosen": -25.20751953125, "logps/rejected": -67.90748596191406, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 3.6208646297454834, "rewards/margins": 0.5586864948272705, "rewards/rejected": 3.062178134918213, "step": 7537 }, { "epoch": 1.67, "learning_rate": 7.037165239904514e-07, "logits/chosen": -2.149925947189331, "logits/rejected": -2.149690628051758, "logps/chosen": -56.337486267089844, "logps/rejected": -128.80775451660156, "loss": 0.8529, "rewards/accuracies": 0.0, "rewards/chosen": 9.026530265808105, "rewards/margins": -0.5928077697753906, "rewards/rejected": 9.619338035583496, "step": 7538 }, { "epoch": 1.67, "learning_rate": 7.027999441973177e-07, "logits/chosen": -2.125115156173706, "logits/rejected": -2.0049731731414795, "logps/chosen": -98.35972595214844, "logps/rejected": -64.08431243896484, "loss": 0.3286, "rewards/accuracies": 1.0, "rewards/chosen": 7.920620918273926, "rewards/margins": 3.4994211196899414, "rewards/rejected": 4.421199798583984, "step": 7539 }, { "epoch": 1.67, "learning_rate": 7.018839165840635e-07, "logits/chosen": -2.120488166809082, "logits/rejected": -1.9842933416366577, "logps/chosen": -103.21778869628906, "logps/rejected": -13.99791145324707, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 5.120219707489014, "rewards/margins": 3.359898090362549, "rewards/rejected": 1.7603216171264648, "step": 7540 }, { "epoch": 1.67, "learning_rate": 7.009684412683915e-07, "logits/chosen": -2.190021276473999, "logits/rejected": -2.2310776710510254, "logps/chosen": -7.420714855194092, "logps/rejected": -43.982757568359375, "loss": 0.8605, "rewards/accuracies": 0.0, "rewards/chosen": 2.816692352294922, "rewards/margins": -1.4929094314575195, "rewards/rejected": 4.309601783752441, "step": 7541 }, { "epoch": 1.67, "learning_rate": 7.000535183679425e-07, "logits/chosen": -1.7504339218139648, "logits/rejected": -1.7504339218139648, "logps/chosen": -31.55446434020996, "logps/rejected": -31.55446434020996, "loss": 2.3155, "rewards/accuracies": 0.0, "rewards/chosen": 7.257549285888672, "rewards/margins": 0.0, "rewards/rejected": 7.257549285888672, "step": 7542 }, { "epoch": 1.67, "learning_rate": 6.99139148000278e-07, "logits/chosen": -2.0459601879119873, "logits/rejected": -1.8175463676452637, "logps/chosen": -99.3537826538086, "logps/rejected": -66.40664672851562, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 7.200411319732666, "rewards/margins": 3.006890296936035, "rewards/rejected": 4.193521022796631, "step": 7543 }, { "epoch": 1.67, "learning_rate": 6.982253302828945e-07, "logits/chosen": -2.0505356788635254, "logits/rejected": -2.0284247398376465, "logps/chosen": -111.50251007080078, "logps/rejected": -64.11280822753906, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 6.637832164764404, "rewards/margins": 3.7951066493988037, "rewards/rejected": 2.8427255153656006, "step": 7544 }, { "epoch": 1.67, "learning_rate": 6.973120653332144e-07, "logits/chosen": -2.066737413406372, "logits/rejected": -2.0970935821533203, "logps/chosen": -35.96003723144531, "logps/rejected": -120.25201416015625, "loss": 3.149, "rewards/accuracies": 0.0, "rewards/chosen": 3.774919271469116, "rewards/margins": -6.218046188354492, "rewards/rejected": 9.992965698242188, "step": 7545 }, { "epoch": 1.67, "learning_rate": 6.963993532685909e-07, "logits/chosen": -1.854344367980957, "logits/rejected": -1.854344367980957, "logps/chosen": -35.37953186035156, "logps/rejected": -35.37953186035156, "loss": 0.3736, "rewards/accuracies": 0.0, "rewards/chosen": 2.964681386947632, "rewards/margins": 0.0, "rewards/rejected": 2.964681386947632, "step": 7546 }, { "epoch": 1.67, "learning_rate": 6.954871942063046e-07, "logits/chosen": -1.8521546125411987, "logits/rejected": -1.771072506904602, "logps/chosen": -73.46554565429688, "logps/rejected": -49.569278717041016, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 8.46349811553955, "rewards/margins": 4.398914813995361, "rewards/rejected": 4.0645833015441895, "step": 7547 }, { "epoch": 1.67, "learning_rate": 6.945755882635663e-07, "logits/chosen": -1.811806559562683, "logits/rejected": -1.8475240468978882, "logps/chosen": -22.603782653808594, "logps/rejected": -107.39493560791016, "loss": 0.5461, "rewards/accuracies": 0.0, "rewards/chosen": 4.769512176513672, "rewards/margins": -0.5758957862854004, "rewards/rejected": 5.345407962799072, "step": 7548 }, { "epoch": 1.67, "learning_rate": 6.936645355575162e-07, "logits/chosen": -2.0446648597717285, "logits/rejected": -1.9723809957504272, "logps/chosen": -65.50640106201172, "logps/rejected": -55.018272399902344, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 7.3766608238220215, "rewards/margins": 2.036538600921631, "rewards/rejected": 5.340122222900391, "step": 7549 }, { "epoch": 1.67, "learning_rate": 6.927540362052199e-07, "logits/chosen": -1.808292031288147, "logits/rejected": -1.796600103378296, "logps/chosen": -3.9761834144592285, "logps/rejected": -13.789770126342773, "loss": 0.4157, "rewards/accuracies": 1.0, "rewards/chosen": 2.025320291519165, "rewards/margins": 0.6716358661651611, "rewards/rejected": 1.353684425354004, "step": 7550 }, { "epoch": 1.67, "learning_rate": 6.918440903236756e-07, "logits/chosen": -2.0258960723876953, "logits/rejected": -2.0881268978118896, "logps/chosen": -65.14802551269531, "logps/rejected": -113.18763732910156, "loss": 0.3178, "rewards/accuracies": 1.0, "rewards/chosen": 8.9088716506958, "rewards/margins": 1.3905258178710938, "rewards/rejected": 7.518345832824707, "step": 7551 }, { "epoch": 1.67, "learning_rate": 6.909346980298093e-07, "logits/chosen": -1.9342705011367798, "logits/rejected": -1.9404858350753784, "logps/chosen": -24.139686584472656, "logps/rejected": -53.93657684326172, "loss": 0.7836, "rewards/accuracies": 0.0, "rewards/chosen": 3.1603164672851562, "rewards/margins": -0.40408945083618164, "rewards/rejected": 3.564405918121338, "step": 7552 }, { "epoch": 1.67, "learning_rate": 6.900258594404752e-07, "logits/chosen": -2.05401873588562, "logits/rejected": -1.9765355587005615, "logps/chosen": -150.69869995117188, "logps/rejected": -76.35948181152344, "loss": 0.3724, "rewards/accuracies": 0.0, "rewards/chosen": 6.798993110656738, "rewards/margins": -0.03534078598022461, "rewards/rejected": 6.834333896636963, "step": 7553 }, { "epoch": 1.67, "learning_rate": 6.891175746724577e-07, "logits/chosen": -1.837264895439148, "logits/rejected": -1.8347793817520142, "logps/chosen": -53.81990432739258, "logps/rejected": -60.412147521972656, "loss": 0.6137, "rewards/accuracies": 0.0, "rewards/chosen": 5.071101188659668, "rewards/margins": -0.8354344367980957, "rewards/rejected": 5.906535625457764, "step": 7554 }, { "epoch": 1.67, "learning_rate": 6.882098438424683e-07, "logits/chosen": -1.8438947200775146, "logits/rejected": -1.7732232809066772, "logps/chosen": -104.56153869628906, "logps/rejected": -85.49290466308594, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 7.650090217590332, "rewards/margins": 1.8799548149108887, "rewards/rejected": 5.770135402679443, "step": 7555 }, { "epoch": 1.67, "learning_rate": 6.873026670671495e-07, "logits/chosen": -1.7590229511260986, "logits/rejected": -1.6970120668411255, "logps/chosen": -63.086700439453125, "logps/rejected": -63.38504409790039, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 3.1422417163848877, "rewards/margins": 2.1013729572296143, "rewards/rejected": 1.0408687591552734, "step": 7556 }, { "epoch": 1.67, "learning_rate": 6.863960444630702e-07, "logits/chosen": -1.9025813341140747, "logits/rejected": -1.9025813341140747, "logps/chosen": -39.6773681640625, "logps/rejected": -39.6773681640625, "loss": 0.3548, "rewards/accuracies": 0.0, "rewards/chosen": 3.7179458141326904, "rewards/margins": 0.0, "rewards/rejected": 3.7179458141326904, "step": 7557 }, { "epoch": 1.67, "learning_rate": 6.854899761467293e-07, "logits/chosen": -1.8601118326187134, "logits/rejected": -1.8742387294769287, "logps/chosen": -58.55253219604492, "logps/rejected": -112.83058166503906, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": 5.824818134307861, "rewards/margins": 2.041581392288208, "rewards/rejected": 3.7832367420196533, "step": 7558 }, { "epoch": 1.67, "learning_rate": 6.84584462234556e-07, "logits/chosen": -1.8831814527511597, "logits/rejected": -1.7932050228118896, "logps/chosen": -46.40388488769531, "logps/rejected": -12.6442289352417, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 4.054934024810791, "rewards/margins": 3.457998752593994, "rewards/rejected": 0.5969352126121521, "step": 7559 }, { "epoch": 1.67, "learning_rate": 6.836795028429039e-07, "logits/chosen": -1.8038326501846313, "logits/rejected": -1.8452985286712646, "logps/chosen": -95.98262023925781, "logps/rejected": -166.01370239257812, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 10.428382873535156, "rewards/margins": 3.2295684814453125, "rewards/rejected": 7.198814392089844, "step": 7560 }, { "epoch": 1.67, "learning_rate": 6.827750980880599e-07, "logits/chosen": -1.9362397193908691, "logits/rejected": -1.9607939720153809, "logps/chosen": -23.080781936645508, "logps/rejected": -53.060203552246094, "loss": 0.4358, "rewards/accuracies": 1.0, "rewards/chosen": 3.5308640003204346, "rewards/margins": 0.08148574829101562, "rewards/rejected": 3.449378252029419, "step": 7561 }, { "epoch": 1.67, "learning_rate": 6.818712480862371e-07, "logits/chosen": -1.9124234914779663, "logits/rejected": -1.9074411392211914, "logps/chosen": -71.9392318725586, "logps/rejected": -73.8128433227539, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": 4.937038421630859, "rewards/margins": 1.7191619873046875, "rewards/rejected": 3.217876434326172, "step": 7562 }, { "epoch": 1.67, "learning_rate": 6.80967952953579e-07, "logits/chosen": -1.9337382316589355, "logits/rejected": -1.8356503248214722, "logps/chosen": -189.08900451660156, "logps/rejected": -41.50531768798828, "loss": 0.2827, "rewards/accuracies": 1.0, "rewards/chosen": 4.401937961578369, "rewards/margins": 0.383817195892334, "rewards/rejected": 4.018120765686035, "step": 7563 }, { "epoch": 1.67, "learning_rate": 6.800652128061557e-07, "logits/chosen": -2.0215961933135986, "logits/rejected": -2.0415680408477783, "logps/chosen": -43.92682647705078, "logps/rejected": -95.18379974365234, "loss": 0.4104, "rewards/accuracies": 0.0, "rewards/chosen": 4.664888858795166, "rewards/margins": -0.10799884796142578, "rewards/rejected": 4.772887706756592, "step": 7564 }, { "epoch": 1.67, "learning_rate": 6.791630277599681e-07, "logits/chosen": -2.1827549934387207, "logits/rejected": -2.1713173389434814, "logps/chosen": -78.9940185546875, "logps/rejected": -121.15492248535156, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 7.353831768035889, "rewards/margins": 5.549330234527588, "rewards/rejected": 1.8045014142990112, "step": 7565 }, { "epoch": 1.67, "learning_rate": 6.782613979309443e-07, "logits/chosen": -1.9590075016021729, "logits/rejected": -1.987019658088684, "logps/chosen": -143.51864624023438, "logps/rejected": -88.43600463867188, "loss": 0.5151, "rewards/accuracies": 0.0, "rewards/chosen": 8.470511436462402, "rewards/margins": -0.5512170791625977, "rewards/rejected": 9.021728515625, "step": 7566 }, { "epoch": 1.67, "learning_rate": 6.773603234349418e-07, "logits/chosen": -1.8466217517852783, "logits/rejected": -1.8448221683502197, "logps/chosen": -34.526676177978516, "logps/rejected": -64.37997436523438, "loss": 1.4073, "rewards/accuracies": 1.0, "rewards/chosen": 3.9670169353485107, "rewards/margins": 1.4025728702545166, "rewards/rejected": 2.564444065093994, "step": 7567 }, { "epoch": 1.68, "learning_rate": 6.764598043877474e-07, "logits/chosen": -1.7306385040283203, "logits/rejected": -1.7306385040283203, "logps/chosen": -29.89189910888672, "logps/rejected": -29.89189910888672, "loss": 0.3476, "rewards/accuracies": 0.0, "rewards/chosen": 2.5659825801849365, "rewards/margins": 0.0, "rewards/rejected": 2.5659825801849365, "step": 7568 }, { "epoch": 1.68, "learning_rate": 6.755598409050734e-07, "logits/chosen": -1.8611137866973877, "logits/rejected": -1.8677736520767212, "logps/chosen": -76.07673645019531, "logps/rejected": -121.88369750976562, "loss": 0.5596, "rewards/accuracies": 0.0, "rewards/chosen": 7.121806621551514, "rewards/margins": -0.3838653564453125, "rewards/rejected": 7.505671977996826, "step": 7569 }, { "epoch": 1.68, "learning_rate": 6.746604331025642e-07, "logits/chosen": -1.7635408639907837, "logits/rejected": -1.608112096786499, "logps/chosen": -125.56724548339844, "logps/rejected": -11.831961631774902, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 5.877317905426025, "rewards/margins": 4.75062370300293, "rewards/rejected": 1.1266940832138062, "step": 7570 }, { "epoch": 1.68, "learning_rate": 6.73761581095792e-07, "logits/chosen": -1.937180757522583, "logits/rejected": -1.875017762184143, "logps/chosen": -34.470420837402344, "logps/rejected": -5.3106913566589355, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 3.224416494369507, "rewards/margins": 2.416386604309082, "rewards/rejected": 0.8080297708511353, "step": 7571 }, { "epoch": 1.68, "learning_rate": 6.728632850002559e-07, "logits/chosen": -1.9276524782180786, "logits/rejected": -1.8852211236953735, "logps/chosen": -59.91393280029297, "logps/rejected": -40.76129913330078, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 3.8838775157928467, "rewards/margins": 1.889570951461792, "rewards/rejected": 1.9943065643310547, "step": 7572 }, { "epoch": 1.68, "learning_rate": 6.719655449313861e-07, "logits/chosen": -1.8573297262191772, "logits/rejected": -1.7969993352890015, "logps/chosen": -72.6721420288086, "logps/rejected": -39.09241485595703, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 3.8501548767089844, "rewards/margins": 1.930566430091858, "rewards/rejected": 1.9195884466171265, "step": 7573 }, { "epoch": 1.68, "learning_rate": 6.710683610045393e-07, "logits/chosen": -1.8647215366363525, "logits/rejected": -1.9213601350784302, "logps/chosen": -94.9416275024414, "logps/rejected": -162.79843139648438, "loss": 0.0798, "rewards/accuracies": 1.0, "rewards/chosen": 10.614623069763184, "rewards/margins": 2.081822395324707, "rewards/rejected": 8.532800674438477, "step": 7574 }, { "epoch": 1.68, "learning_rate": 6.701717333350016e-07, "logits/chosen": -1.8913272619247437, "logits/rejected": -1.883586049079895, "logps/chosen": -27.798473358154297, "logps/rejected": -46.4298210144043, "loss": 0.9397, "rewards/accuracies": 0.0, "rewards/chosen": 4.957284450531006, "rewards/margins": -0.05202913284301758, "rewards/rejected": 5.009313583374023, "step": 7575 }, { "epoch": 1.68, "learning_rate": 6.692756620379875e-07, "logits/chosen": -1.921242594718933, "logits/rejected": -1.9087786674499512, "logps/chosen": -43.43587875366211, "logps/rejected": -42.55546188354492, "loss": 0.7468, "rewards/accuracies": 0.0, "rewards/chosen": 6.333186149597168, "rewards/margins": -0.20431995391845703, "rewards/rejected": 6.537506103515625, "step": 7576 }, { "epoch": 1.68, "learning_rate": 6.683801472286411e-07, "logits/chosen": -1.8950632810592651, "logits/rejected": -1.7941832542419434, "logps/chosen": -107.05621337890625, "logps/rejected": -23.349016189575195, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 7.415426731109619, "rewards/margins": 6.3087239265441895, "rewards/rejected": 1.1067026853561401, "step": 7577 }, { "epoch": 1.68, "learning_rate": 6.674851890220307e-07, "logits/chosen": -2.138702869415283, "logits/rejected": -2.1501007080078125, "logps/chosen": -71.65190124511719, "logps/rejected": -60.55677795410156, "loss": 0.3825, "rewards/accuracies": 1.0, "rewards/chosen": 3.4317779541015625, "rewards/margins": 0.4328291416168213, "rewards/rejected": 2.998948812484741, "step": 7578 }, { "epoch": 1.68, "learning_rate": 6.665907875331612e-07, "logits/chosen": -1.9478607177734375, "logits/rejected": -1.8505440950393677, "logps/chosen": -83.27113342285156, "logps/rejected": -18.895078659057617, "loss": 0.0898, "rewards/accuracies": 1.0, "rewards/chosen": 5.481807231903076, "rewards/margins": 1.8931269645690918, "rewards/rejected": 3.5886802673339844, "step": 7579 }, { "epoch": 1.68, "learning_rate": 6.656969428769567e-07, "logits/chosen": -2.1519992351531982, "logits/rejected": -2.0752053260803223, "logps/chosen": -87.01502990722656, "logps/rejected": -47.932125091552734, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 7.603611946105957, "rewards/margins": 4.9416913986206055, "rewards/rejected": 2.6619205474853516, "step": 7580 }, { "epoch": 1.68, "learning_rate": 6.648036551682752e-07, "logits/chosen": -1.6241251230239868, "logits/rejected": -1.7505286931991577, "logps/chosen": -44.338436126708984, "logps/rejected": -84.86431884765625, "loss": 2.946, "rewards/accuracies": 0.0, "rewards/chosen": 4.111884117126465, "rewards/margins": -5.863512992858887, "rewards/rejected": 9.975397109985352, "step": 7581 }, { "epoch": 1.68, "learning_rate": 6.639109245219033e-07, "logits/chosen": -2.521981954574585, "logits/rejected": -2.528139591217041, "logps/chosen": -98.70735931396484, "logps/rejected": -125.34463500976562, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": 11.416706085205078, "rewards/margins": 2.1402931213378906, "rewards/rejected": 9.276412963867188, "step": 7582 }, { "epoch": 1.68, "learning_rate": 6.630187510525532e-07, "logits/chosen": -1.9622743129730225, "logits/rejected": -1.939103364944458, "logps/chosen": -31.56667709350586, "logps/rejected": -52.75656509399414, "loss": 0.4002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5506017208099365, "rewards/margins": 1.3558167219161987, "rewards/rejected": 1.1947849988937378, "step": 7583 }, { "epoch": 1.68, "learning_rate": 6.621271348748681e-07, "logits/chosen": -1.8869372606277466, "logits/rejected": -1.905413269996643, "logps/chosen": -52.22394943237305, "logps/rejected": -83.0799331665039, "loss": 0.3241, "rewards/accuracies": 1.0, "rewards/chosen": 3.878824234008789, "rewards/margins": 0.3687152862548828, "rewards/rejected": 3.5101089477539062, "step": 7584 }, { "epoch": 1.68, "learning_rate": 6.612360761034187e-07, "logits/chosen": -1.764193058013916, "logits/rejected": -1.764193058013916, "logps/chosen": -66.93988037109375, "logps/rejected": -66.93988037109375, "loss": 0.3565, "rewards/accuracies": 0.0, "rewards/chosen": 8.62967586517334, "rewards/margins": 0.0, "rewards/rejected": 8.62967586517334, "step": 7585 }, { "epoch": 1.68, "learning_rate": 6.603455748527038e-07, "logits/chosen": -1.8007051944732666, "logits/rejected": -1.8047524690628052, "logps/chosen": -33.353782653808594, "logps/rejected": -68.16897583007812, "loss": 0.3114, "rewards/accuracies": 1.0, "rewards/chosen": 3.9534950256347656, "rewards/margins": 0.2827141284942627, "rewards/rejected": 3.670780897140503, "step": 7586 }, { "epoch": 1.68, "learning_rate": 6.594556312371487e-07, "logits/chosen": -1.798167109489441, "logits/rejected": -1.202748417854309, "logps/chosen": -23.337093353271484, "logps/rejected": -49.295570373535156, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": 3.2363369464874268, "rewards/margins": 1.3610104322433472, "rewards/rejected": 1.8753265142440796, "step": 7587 }, { "epoch": 1.68, "learning_rate": 6.585662453711122e-07, "logits/chosen": -1.9127191305160522, "logits/rejected": -1.9180270433425903, "logps/chosen": -26.858808517456055, "logps/rejected": -63.04121780395508, "loss": 0.8921, "rewards/accuracies": 0.0, "rewards/chosen": 4.083950996398926, "rewards/margins": -1.2315478324890137, "rewards/rejected": 5.3154988288879395, "step": 7588 }, { "epoch": 1.68, "learning_rate": 6.576774173688754e-07, "logits/chosen": -2.0222275257110596, "logits/rejected": -2.113164186477661, "logps/chosen": -62.686378479003906, "logps/rejected": -81.77076721191406, "loss": 2.4289, "rewards/accuracies": 0.0, "rewards/chosen": 5.897725582122803, "rewards/margins": -4.83618688583374, "rewards/rejected": 10.733912467956543, "step": 7589 }, { "epoch": 1.68, "learning_rate": 6.567891473446525e-07, "logits/chosen": -1.7365549802780151, "logits/rejected": -1.693723201751709, "logps/chosen": -29.680328369140625, "logps/rejected": -47.715511322021484, "loss": 0.4937, "rewards/accuracies": 1.0, "rewards/chosen": 2.975444793701172, "rewards/margins": 0.3548405170440674, "rewards/rejected": 2.6206042766571045, "step": 7590 }, { "epoch": 1.68, "learning_rate": 6.559014354125831e-07, "logits/chosen": -1.7026182413101196, "logits/rejected": -1.659675121307373, "logps/chosen": -31.390872955322266, "logps/rejected": -57.69359588623047, "loss": 0.3421, "rewards/accuracies": 1.0, "rewards/chosen": 3.148911714553833, "rewards/margins": 0.027669191360473633, "rewards/rejected": 3.1212425231933594, "step": 7591 }, { "epoch": 1.68, "learning_rate": 6.550142816867366e-07, "logits/chosen": -2.0027928352355957, "logits/rejected": -2.0027928352355957, "logps/chosen": -27.112144470214844, "logps/rejected": -27.112144470214844, "loss": 0.3773, "rewards/accuracies": 0.0, "rewards/chosen": 5.101714611053467, "rewards/margins": 0.0, "rewards/rejected": 5.101714611053467, "step": 7592 }, { "epoch": 1.68, "learning_rate": 6.541276862811091e-07, "logits/chosen": -1.746904969215393, "logits/rejected": -1.6927642822265625, "logps/chosen": -62.762516021728516, "logps/rejected": -83.32592010498047, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 5.757954120635986, "rewards/margins": 1.4212093353271484, "rewards/rejected": 4.336744785308838, "step": 7593 }, { "epoch": 1.68, "learning_rate": 6.532416493096272e-07, "logits/chosen": -2.202608108520508, "logits/rejected": -2.175816297531128, "logps/chosen": -43.06134796142578, "logps/rejected": -59.85645294189453, "loss": 0.2795, "rewards/accuracies": 1.0, "rewards/chosen": 4.196940898895264, "rewards/margins": 0.34932732582092285, "rewards/rejected": 3.847613573074341, "step": 7594 }, { "epoch": 1.68, "learning_rate": 6.523561708861454e-07, "logits/chosen": -1.941148042678833, "logits/rejected": -1.941024899482727, "logps/chosen": -51.219215393066406, "logps/rejected": -79.58878326416016, "loss": 0.5968, "rewards/accuracies": 0.0, "rewards/chosen": 5.562152862548828, "rewards/margins": -0.8282852172851562, "rewards/rejected": 6.390438079833984, "step": 7595 }, { "epoch": 1.68, "learning_rate": 6.51471251124442e-07, "logits/chosen": -2.087967872619629, "logits/rejected": -2.0713119506835938, "logps/chosen": -92.79965209960938, "logps/rejected": -80.5886459350586, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 8.947306632995605, "rewards/margins": 3.4385929107666016, "rewards/rejected": 5.508713722229004, "step": 7596 }, { "epoch": 1.68, "learning_rate": 6.505868901382312e-07, "logits/chosen": -1.8664586544036865, "logits/rejected": -1.8280915021896362, "logps/chosen": -65.19784545898438, "logps/rejected": -90.51432800292969, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 7.525332927703857, "rewards/margins": 1.7154192924499512, "rewards/rejected": 5.809913635253906, "step": 7597 }, { "epoch": 1.68, "learning_rate": 6.49703088041147e-07, "logits/chosen": -1.7838791608810425, "logits/rejected": -1.7660242319107056, "logps/chosen": -32.26306915283203, "logps/rejected": -31.524930953979492, "loss": 0.3051, "rewards/accuracies": 1.0, "rewards/chosen": 1.9643394947052002, "rewards/margins": 0.917837381362915, "rewards/rejected": 1.0465021133422852, "step": 7598 }, { "epoch": 1.68, "learning_rate": 6.488198449467603e-07, "logits/chosen": -1.8007200956344604, "logits/rejected": -1.7401772737503052, "logps/chosen": -84.76800537109375, "logps/rejected": -60.582977294921875, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 7.837674140930176, "rewards/margins": 3.975787401199341, "rewards/rejected": 3.861886739730835, "step": 7599 }, { "epoch": 1.68, "learning_rate": 6.479371609685625e-07, "logits/chosen": -1.8754632472991943, "logits/rejected": -1.731776475906372, "logps/chosen": -85.30108642578125, "logps/rejected": -5.4813995361328125, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": 7.396228313446045, "rewards/margins": 6.1981964111328125, "rewards/rejected": 1.1980316638946533, "step": 7600 }, { "epoch": 1.68, "learning_rate": 6.470550362199779e-07, "logits/chosen": -1.705930233001709, "logits/rejected": -1.6815544366836548, "logps/chosen": -109.22925567626953, "logps/rejected": -67.21222686767578, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 6.962325572967529, "rewards/margins": 4.29454231262207, "rewards/rejected": 2.667783498764038, "step": 7601 }, { "epoch": 1.68, "learning_rate": 6.461734708143574e-07, "logits/chosen": -1.8745737075805664, "logits/rejected": -1.8745737075805664, "logps/chosen": -21.285755157470703, "logps/rejected": -21.285755157470703, "loss": 0.4258, "rewards/accuracies": 0.0, "rewards/chosen": 3.280719518661499, "rewards/margins": 0.0, "rewards/rejected": 3.280719518661499, "step": 7602 }, { "epoch": 1.68, "learning_rate": 6.452924648649777e-07, "logits/chosen": -1.7245945930480957, "logits/rejected": -1.6926015615463257, "logps/chosen": -35.81442642211914, "logps/rejected": -41.15367126464844, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": 3.353426694869995, "rewards/margins": 2.029956817626953, "rewards/rejected": 1.3234699964523315, "step": 7603 }, { "epoch": 1.68, "learning_rate": 6.444120184850506e-07, "logits/chosen": -2.094939708709717, "logits/rejected": -1.9621516466140747, "logps/chosen": -60.91886901855469, "logps/rejected": -53.31886672973633, "loss": 0.2229, "rewards/accuracies": 1.0, "rewards/chosen": 3.4601516723632812, "rewards/margins": 0.6472728252410889, "rewards/rejected": 2.8128788471221924, "step": 7604 }, { "epoch": 1.68, "learning_rate": 6.435321317877057e-07, "logits/chosen": -1.8521445989608765, "logits/rejected": -1.8474516868591309, "logps/chosen": -42.979164123535156, "logps/rejected": -53.43072509765625, "loss": 0.9381, "rewards/accuracies": 1.0, "rewards/chosen": 4.524608135223389, "rewards/margins": 0.5641496181488037, "rewards/rejected": 3.960458517074585, "step": 7605 }, { "epoch": 1.68, "learning_rate": 6.426528048860114e-07, "logits/chosen": -2.0168607234954834, "logits/rejected": -1.6155173778533936, "logps/chosen": -44.05476760864258, "logps/rejected": -37.902984619140625, "loss": 1.4283, "rewards/accuracies": 1.0, "rewards/chosen": 3.678915023803711, "rewards/margins": 1.6070036888122559, "rewards/rejected": 2.071911334991455, "step": 7606 }, { "epoch": 1.68, "learning_rate": 6.417740378929549e-07, "logits/chosen": -1.91480553150177, "logits/rejected": -1.8062694072723389, "logps/chosen": -46.512882232666016, "logps/rejected": -14.10856819152832, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": 3.3771932125091553, "rewards/margins": 1.8662017583847046, "rewards/rejected": 1.5109914541244507, "step": 7607 }, { "epoch": 1.68, "learning_rate": 6.408958309214597e-07, "logits/chosen": -1.943540096282959, "logits/rejected": -1.556794285774231, "logps/chosen": -113.27137756347656, "logps/rejected": -39.44763946533203, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 9.524968147277832, "rewards/margins": 4.572396755218506, "rewards/rejected": 4.952571392059326, "step": 7608 }, { "epoch": 1.68, "learning_rate": 6.400181840843705e-07, "logits/chosen": -1.7364189624786377, "logits/rejected": -1.7455344200134277, "logps/chosen": -36.23394012451172, "logps/rejected": -37.17763137817383, "loss": 0.2365, "rewards/accuracies": 1.0, "rewards/chosen": 4.233609199523926, "rewards/margins": 0.7009222507476807, "rewards/rejected": 3.532686948776245, "step": 7609 }, { "epoch": 1.68, "learning_rate": 6.391410974944634e-07, "logits/chosen": -1.9050880670547485, "logits/rejected": -1.94109046459198, "logps/chosen": -78.42774963378906, "logps/rejected": -74.37843322753906, "loss": 0.6727, "rewards/accuracies": 0.0, "rewards/chosen": 6.2843217849731445, "rewards/margins": -0.8999876976013184, "rewards/rejected": 7.184309482574463, "step": 7610 }, { "epoch": 1.68, "learning_rate": 6.382645712644431e-07, "logits/chosen": -1.9667164087295532, "logits/rejected": -1.9989826679229736, "logps/chosen": -68.0728988647461, "logps/rejected": -83.62800598144531, "loss": 0.1933, "rewards/accuracies": 1.0, "rewards/chosen": 8.392080307006836, "rewards/margins": 1.1477532386779785, "rewards/rejected": 7.244327068328857, "step": 7611 }, { "epoch": 1.68, "learning_rate": 6.373886055069378e-07, "logits/chosen": -1.9649012088775635, "logits/rejected": -1.9503060579299927, "logps/chosen": -66.21894836425781, "logps/rejected": -46.49027633666992, "loss": 0.3394, "rewards/accuracies": 1.0, "rewards/chosen": 3.339444875717163, "rewards/margins": 0.04618120193481445, "rewards/rejected": 3.2932636737823486, "step": 7612 }, { "epoch": 1.69, "learning_rate": 6.365132003345115e-07, "logits/chosen": -1.8951644897460938, "logits/rejected": -1.9497958421707153, "logps/chosen": -86.88134002685547, "logps/rejected": -198.5571746826172, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": 11.331072807312012, "rewards/margins": 2.985828399658203, "rewards/rejected": 8.345244407653809, "step": 7613 }, { "epoch": 1.69, "learning_rate": 6.356383558596469e-07, "logits/chosen": -1.9370391368865967, "logits/rejected": -1.8306032419204712, "logps/chosen": -153.51089477539062, "logps/rejected": -46.426456451416016, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 6.759359836578369, "rewards/margins": 4.408868789672852, "rewards/rejected": 2.3504910469055176, "step": 7614 }, { "epoch": 1.69, "learning_rate": 6.347640721947646e-07, "logits/chosen": -2.172283887863159, "logits/rejected": -2.158586263656616, "logps/chosen": -89.00895690917969, "logps/rejected": -46.71855163574219, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": 5.361085414886475, "rewards/margins": 2.540146589279175, "rewards/rejected": 2.8209388256073, "step": 7615 }, { "epoch": 1.69, "learning_rate": 6.338903494522031e-07, "logits/chosen": -1.899731159210205, "logits/rejected": -1.904736876487732, "logps/chosen": -62.328514099121094, "logps/rejected": -76.17820739746094, "loss": 0.5109, "rewards/accuracies": 1.0, "rewards/chosen": 2.5702340602874756, "rewards/margins": 0.19302606582641602, "rewards/rejected": 2.3772079944610596, "step": 7616 }, { "epoch": 1.69, "learning_rate": 6.330171877442376e-07, "logits/chosen": -2.0903007984161377, "logits/rejected": -2.0947134494781494, "logps/chosen": -39.34247589111328, "logps/rejected": -219.2996063232422, "loss": 1.6981, "rewards/accuracies": 0.0, "rewards/chosen": 5.781748294830322, "rewards/margins": -3.2897439002990723, "rewards/rejected": 9.071492195129395, "step": 7617 }, { "epoch": 1.69, "learning_rate": 6.321445871830645e-07, "logits/chosen": -2.0456154346466064, "logits/rejected": -1.998014211654663, "logps/chosen": -71.33500671386719, "logps/rejected": -141.05722045898438, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 11.363029479980469, "rewards/margins": 3.213374137878418, "rewards/rejected": 8.14965534210205, "step": 7618 }, { "epoch": 1.69, "learning_rate": 6.312725478808123e-07, "logits/chosen": -1.8652877807617188, "logits/rejected": -1.8652877807617188, "logps/chosen": -72.32518768310547, "logps/rejected": -72.32518768310547, "loss": 0.3477, "rewards/accuracies": 0.0, "rewards/chosen": 4.679067134857178, "rewards/margins": 0.0, "rewards/rejected": 4.679067134857178, "step": 7619 }, { "epoch": 1.69, "learning_rate": 6.304010699495366e-07, "logits/chosen": -1.9069092273712158, "logits/rejected": -1.8564890623092651, "logps/chosen": -97.9571762084961, "logps/rejected": -135.68521118164062, "loss": 0.1624, "rewards/accuracies": 1.0, "rewards/chosen": 8.434128761291504, "rewards/margins": 2.6894845962524414, "rewards/rejected": 5.7446441650390625, "step": 7620 }, { "epoch": 1.69, "learning_rate": 6.295301535012166e-07, "logits/chosen": -1.8807085752487183, "logits/rejected": -1.9293016195297241, "logps/chosen": -40.68218994140625, "logps/rejected": -75.92332458496094, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": 4.346097469329834, "rewards/margins": 0.7649710178375244, "rewards/rejected": 3.5811264514923096, "step": 7621 }, { "epoch": 1.69, "learning_rate": 6.286597986477683e-07, "logits/chosen": -1.8746449947357178, "logits/rejected": -1.803673267364502, "logps/chosen": -45.97208786010742, "logps/rejected": -27.119592666625977, "loss": 0.13, "rewards/accuracies": 1.0, "rewards/chosen": 6.030139446258545, "rewards/margins": 1.220461368560791, "rewards/rejected": 4.809678077697754, "step": 7622 }, { "epoch": 1.69, "learning_rate": 6.27790005501025e-07, "logits/chosen": -1.8850202560424805, "logits/rejected": -1.8551115989685059, "logps/chosen": -89.71888732910156, "logps/rejected": -51.90283966064453, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 7.993827819824219, "rewards/margins": 3.187089443206787, "rewards/rejected": 4.806738376617432, "step": 7623 }, { "epoch": 1.69, "learning_rate": 6.269207741727578e-07, "logits/chosen": -1.9600754976272583, "logits/rejected": -1.9514156579971313, "logps/chosen": -42.10034942626953, "logps/rejected": -45.11258316040039, "loss": 0.5237, "rewards/accuracies": 1.0, "rewards/chosen": 4.2361321449279785, "rewards/margins": 1.4734477996826172, "rewards/rejected": 2.7626843452453613, "step": 7624 }, { "epoch": 1.69, "learning_rate": 6.260521047746571e-07, "logits/chosen": -1.9988943338394165, "logits/rejected": -2.0167288780212402, "logps/chosen": -51.199371337890625, "logps/rejected": -67.39736938476562, "loss": 1.0033, "rewards/accuracies": 0.0, "rewards/chosen": 3.722801923751831, "rewards/margins": -1.849938154220581, "rewards/rejected": 5.572740077972412, "step": 7625 }, { "epoch": 1.69, "learning_rate": 6.251839974183465e-07, "logits/chosen": -1.9661873579025269, "logits/rejected": -1.9911668300628662, "logps/chosen": -49.175384521484375, "logps/rejected": -31.01605224609375, "loss": 0.2858, "rewards/accuracies": 1.0, "rewards/chosen": 3.6598801612854004, "rewards/margins": 0.26070570945739746, "rewards/rejected": 3.399174451828003, "step": 7626 }, { "epoch": 1.69, "learning_rate": 6.243164522153756e-07, "logits/chosen": -2.0872132778167725, "logits/rejected": -2.0513434410095215, "logps/chosen": -36.02595520019531, "logps/rejected": -42.22283172607422, "loss": 0.0716, "rewards/accuracies": 1.0, "rewards/chosen": 3.67252516746521, "rewards/margins": 2.3380627632141113, "rewards/rejected": 1.3344624042510986, "step": 7627 }, { "epoch": 1.69, "learning_rate": 6.234494692772214e-07, "logits/chosen": -2.040349245071411, "logits/rejected": -1.9847811460494995, "logps/chosen": -103.25469207763672, "logps/rejected": -130.64642333984375, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": 9.840203285217285, "rewards/margins": 3.04927921295166, "rewards/rejected": 6.790924072265625, "step": 7628 }, { "epoch": 1.69, "learning_rate": 6.225830487152895e-07, "logits/chosen": -2.1559536457061768, "logits/rejected": -2.1327877044677734, "logps/chosen": -41.83324432373047, "logps/rejected": -69.82211303710938, "loss": 0.1844, "rewards/accuracies": 1.0, "rewards/chosen": 3.7697227001190186, "rewards/margins": 1.2277412414550781, "rewards/rejected": 2.5419814586639404, "step": 7629 }, { "epoch": 1.69, "learning_rate": 6.217171906409131e-07, "logits/chosen": -1.7315559387207031, "logits/rejected": -1.7079691886901855, "logps/chosen": -92.32746887207031, "logps/rejected": -8.057920455932617, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": 2.8449294567108154, "rewards/margins": 1.5069177150726318, "rewards/rejected": 1.3380117416381836, "step": 7630 }, { "epoch": 1.69, "learning_rate": 6.208518951653536e-07, "logits/chosen": -1.5496643781661987, "logits/rejected": -1.5200053453445435, "logps/chosen": -46.58103942871094, "logps/rejected": -64.69277954101562, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 4.901363372802734, "rewards/margins": 2.7347640991210938, "rewards/rejected": 2.1665992736816406, "step": 7631 }, { "epoch": 1.69, "learning_rate": 6.199871623997961e-07, "logits/chosen": -2.0104427337646484, "logits/rejected": -1.9711339473724365, "logps/chosen": -62.127174377441406, "logps/rejected": -51.522377014160156, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 4.787513256072998, "rewards/margins": 2.600393056869507, "rewards/rejected": 2.187120199203491, "step": 7632 }, { "epoch": 1.69, "learning_rate": 6.191229924553615e-07, "logits/chosen": -2.1357998847961426, "logits/rejected": -2.1357998847961426, "logps/chosen": -38.06421661376953, "logps/rejected": -38.06421661376953, "loss": 0.4298, "rewards/accuracies": 0.0, "rewards/chosen": 8.410611152648926, "rewards/margins": 0.0, "rewards/rejected": 8.410611152648926, "step": 7633 }, { "epoch": 1.69, "learning_rate": 6.182593854430896e-07, "logits/chosen": -2.02948260307312, "logits/rejected": -1.9885368347167969, "logps/chosen": -115.25497436523438, "logps/rejected": -62.22797393798828, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 8.017953872680664, "rewards/margins": 1.9698686599731445, "rewards/rejected": 6.0480852127075195, "step": 7634 }, { "epoch": 1.69, "learning_rate": 6.173963414739531e-07, "logits/chosen": -1.8805135488510132, "logits/rejected": -1.8685593605041504, "logps/chosen": -37.284549713134766, "logps/rejected": -75.53335571289062, "loss": 0.2668, "rewards/accuracies": 1.0, "rewards/chosen": 3.877474546432495, "rewards/margins": 1.8411006927490234, "rewards/rejected": 2.0363738536834717, "step": 7635 }, { "epoch": 1.69, "learning_rate": 6.165338606588517e-07, "logits/chosen": -1.9396127462387085, "logits/rejected": -1.9334620237350464, "logps/chosen": -28.251115798950195, "logps/rejected": -18.325260162353516, "loss": 0.4041, "rewards/accuracies": 1.0, "rewards/chosen": 2.76735782623291, "rewards/margins": 0.29842662811279297, "rewards/rejected": 2.468931198120117, "step": 7636 }, { "epoch": 1.69, "learning_rate": 6.156719431086111e-07, "logits/chosen": -1.9771209955215454, "logits/rejected": -1.9771209955215454, "logps/chosen": -34.98865509033203, "logps/rejected": -34.98865509033203, "loss": 0.3665, "rewards/accuracies": 0.0, "rewards/chosen": 3.9451210498809814, "rewards/margins": 0.0, "rewards/rejected": 3.9451210498809814, "step": 7637 }, { "epoch": 1.69, "learning_rate": 6.148105889339867e-07, "logits/chosen": -1.661818027496338, "logits/rejected": -1.661818027496338, "logps/chosen": -12.933115005493164, "logps/rejected": -12.933115005493164, "loss": 0.3501, "rewards/accuracies": 0.0, "rewards/chosen": 2.1747686862945557, "rewards/margins": 0.0, "rewards/rejected": 2.1747686862945557, "step": 7638 }, { "epoch": 1.69, "learning_rate": 6.139497982456593e-07, "logits/chosen": -1.9840682744979858, "logits/rejected": -2.0060553550720215, "logps/chosen": -45.41682052612305, "logps/rejected": -34.662445068359375, "loss": 0.2334, "rewards/accuracies": 1.0, "rewards/chosen": 4.326749324798584, "rewards/margins": 0.6863837242126465, "rewards/rejected": 3.6403656005859375, "step": 7639 }, { "epoch": 1.69, "learning_rate": 6.130895711542406e-07, "logits/chosen": -2.0183229446411133, "logits/rejected": -1.9243049621582031, "logps/chosen": -73.00704193115234, "logps/rejected": -149.06539916992188, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": 9.051481246948242, "rewards/margins": 2.538207530975342, "rewards/rejected": 6.5132737159729, "step": 7640 }, { "epoch": 1.69, "learning_rate": 6.122299077702642e-07, "logits/chosen": -1.8411802053451538, "logits/rejected": -1.6486722230911255, "logps/chosen": -168.4827117919922, "logps/rejected": -25.30085563659668, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": 5.544830322265625, "rewards/margins": 4.138871192932129, "rewards/rejected": 1.405959129333496, "step": 7641 }, { "epoch": 1.69, "learning_rate": 6.113708082041985e-07, "logits/chosen": -1.913187026977539, "logits/rejected": -1.5541843175888062, "logps/chosen": -73.36161804199219, "logps/rejected": -105.29944610595703, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 6.369287014007568, "rewards/margins": 1.661478042602539, "rewards/rejected": 4.707808971405029, "step": 7642 }, { "epoch": 1.69, "learning_rate": 6.105122725664331e-07, "logits/chosen": -1.6729648113250732, "logits/rejected": -1.674354076385498, "logps/chosen": -61.7224006652832, "logps/rejected": -69.7712173461914, "loss": 0.1289, "rewards/accuracies": 1.0, "rewards/chosen": 3.912782669067383, "rewards/margins": 1.2337284088134766, "rewards/rejected": 2.6790542602539062, "step": 7643 }, { "epoch": 1.69, "learning_rate": 6.096543009672884e-07, "logits/chosen": -1.9693763256072998, "logits/rejected": -1.8407913446426392, "logps/chosen": -67.87319946289062, "logps/rejected": -18.551647186279297, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 3.8890137672424316, "rewards/margins": 3.8425722122192383, "rewards/rejected": 0.04644165188074112, "step": 7644 }, { "epoch": 1.69, "learning_rate": 6.087968935170125e-07, "logits/chosen": -1.986909031867981, "logits/rejected": -1.8030773401260376, "logps/chosen": -118.45506286621094, "logps/rejected": -42.41163635253906, "loss": 0.069, "rewards/accuracies": 1.0, "rewards/chosen": 5.777693271636963, "rewards/margins": 2.0987160205841064, "rewards/rejected": 3.6789772510528564, "step": 7645 }, { "epoch": 1.69, "learning_rate": 6.079400503257788e-07, "logits/chosen": -2.102851152420044, "logits/rejected": -2.1108086109161377, "logps/chosen": -44.76423645019531, "logps/rejected": -69.91154479980469, "loss": 1.1924, "rewards/accuracies": 0.0, "rewards/chosen": 3.5924606323242188, "rewards/margins": -1.488166332244873, "rewards/rejected": 5.080626964569092, "step": 7646 }, { "epoch": 1.69, "learning_rate": 6.070837715036909e-07, "logits/chosen": -1.8913551568984985, "logits/rejected": -1.8777341842651367, "logps/chosen": -42.794334411621094, "logps/rejected": -49.415306091308594, "loss": 0.1929, "rewards/accuracies": 1.0, "rewards/chosen": 3.584487199783325, "rewards/margins": 0.8220489025115967, "rewards/rejected": 2.7624382972717285, "step": 7647 }, { "epoch": 1.69, "learning_rate": 6.062280571607781e-07, "logits/chosen": -2.073920726776123, "logits/rejected": -1.9807369709014893, "logps/chosen": -128.29209899902344, "logps/rejected": -53.982303619384766, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 7.315820217132568, "rewards/margins": 4.871461868286133, "rewards/rejected": 2.4443585872650146, "step": 7648 }, { "epoch": 1.69, "learning_rate": 6.053729074069975e-07, "logits/chosen": -2.3419885635375977, "logits/rejected": -2.3601715564727783, "logps/chosen": -43.16755676269531, "logps/rejected": -47.42074203491211, "loss": 0.1433, "rewards/accuracies": 1.0, "rewards/chosen": 4.550201416015625, "rewards/margins": 2.1755754947662354, "rewards/rejected": 2.3746259212493896, "step": 7649 }, { "epoch": 1.69, "learning_rate": 6.045183223522339e-07, "logits/chosen": -1.798723578453064, "logits/rejected": -1.733630895614624, "logps/chosen": -56.33027648925781, "logps/rejected": -44.20211410522461, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 4.554091930389404, "rewards/margins": 2.283344030380249, "rewards/rejected": 2.2707479000091553, "step": 7650 }, { "epoch": 1.69, "learning_rate": 6.036643021063004e-07, "logits/chosen": -2.0419723987579346, "logits/rejected": -1.9822170734405518, "logps/chosen": -70.94316101074219, "logps/rejected": -38.79363250732422, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 8.638056755065918, "rewards/margins": 4.511514663696289, "rewards/rejected": 4.126542091369629, "step": 7651 }, { "epoch": 1.69, "learning_rate": 6.028108467789351e-07, "logits/chosen": -1.7898916006088257, "logits/rejected": -1.7451443672180176, "logps/chosen": -27.913692474365234, "logps/rejected": -34.73799133300781, "loss": 0.2905, "rewards/accuracies": 1.0, "rewards/chosen": 3.1567113399505615, "rewards/margins": 0.23868608474731445, "rewards/rejected": 2.918025255203247, "step": 7652 }, { "epoch": 1.69, "learning_rate": 6.019579564798045e-07, "logits/chosen": -1.9531004428863525, "logits/rejected": -2.017537832260132, "logps/chosen": -64.51264190673828, "logps/rejected": -90.36505126953125, "loss": 3.3167, "rewards/accuracies": 0.0, "rewards/chosen": 3.2351250648498535, "rewards/margins": -6.580838680267334, "rewards/rejected": 9.815963745117188, "step": 7653 }, { "epoch": 1.69, "learning_rate": 6.011056313185049e-07, "logits/chosen": -1.9063549041748047, "logits/rejected": -1.9150192737579346, "logps/chosen": -81.89775848388672, "logps/rejected": -87.20567321777344, "loss": 0.4137, "rewards/accuracies": 0.0, "rewards/chosen": 10.82242202758789, "rewards/margins": -0.25159740447998047, "rewards/rejected": 11.074019432067871, "step": 7654 }, { "epoch": 1.69, "learning_rate": 6.002538714045563e-07, "logits/chosen": -1.9599356651306152, "logits/rejected": -1.908368468284607, "logps/chosen": -53.16741180419922, "logps/rejected": -94.69239807128906, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": 6.463818550109863, "rewards/margins": 1.43226957321167, "rewards/rejected": 5.031548976898193, "step": 7655 }, { "epoch": 1.69, "learning_rate": 5.994026768474087e-07, "logits/chosen": -1.9594767093658447, "logits/rejected": -1.8879461288452148, "logps/chosen": -38.99169921875, "logps/rejected": -44.179874420166016, "loss": 0.4851, "rewards/accuracies": 1.0, "rewards/chosen": 3.948237657546997, "rewards/margins": 0.11009788513183594, "rewards/rejected": 3.838139772415161, "step": 7656 }, { "epoch": 1.69, "learning_rate": 5.985520477564383e-07, "logits/chosen": -1.6637908220291138, "logits/rejected": -1.7304891347885132, "logps/chosen": -194.35133361816406, "logps/rejected": -46.27805709838867, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 6.642014980316162, "rewards/margins": 4.148500442504883, "rewards/rejected": 2.4935147762298584, "step": 7657 }, { "epoch": 1.69, "learning_rate": 5.977019842409492e-07, "logits/chosen": -1.480899691581726, "logits/rejected": -1.2777870893478394, "logps/chosen": -72.61201477050781, "logps/rejected": -20.556013107299805, "loss": 0.4648, "rewards/accuracies": 1.0, "rewards/chosen": 2.1450860500335693, "rewards/margins": 1.276097059249878, "rewards/rejected": 0.8689889907836914, "step": 7658 }, { "epoch": 1.7, "learning_rate": 5.968524864101722e-07, "logits/chosen": -2.285912275314331, "logits/rejected": -2.257469892501831, "logps/chosen": -70.48124694824219, "logps/rejected": -38.07215118408203, "loss": 0.2076, "rewards/accuracies": 1.0, "rewards/chosen": 3.0483644008636475, "rewards/margins": 1.9538391828536987, "rewards/rejected": 1.0945252180099487, "step": 7659 }, { "epoch": 1.7, "learning_rate": 5.960035543732656e-07, "logits/chosen": -2.0077896118164062, "logits/rejected": -2.0077896118164062, "logps/chosen": -16.671871185302734, "logps/rejected": -16.671871185302734, "loss": 0.3481, "rewards/accuracies": 0.0, "rewards/chosen": 1.5896061658859253, "rewards/margins": 0.0, "rewards/rejected": 1.5896061658859253, "step": 7660 }, { "epoch": 1.7, "learning_rate": 5.95155188239317e-07, "logits/chosen": -2.033196449279785, "logits/rejected": -1.9972065687179565, "logps/chosen": -36.583213806152344, "logps/rejected": -42.253719329833984, "loss": 0.1855, "rewards/accuracies": 1.0, "rewards/chosen": 5.661717414855957, "rewards/margins": 0.8576445579528809, "rewards/rejected": 4.804072856903076, "step": 7661 }, { "epoch": 1.7, "learning_rate": 5.943073881173362e-07, "logits/chosen": -1.5511952638626099, "logits/rejected": -1.5595991611480713, "logps/chosen": -51.62260818481445, "logps/rejected": -45.987640380859375, "loss": 0.7261, "rewards/accuracies": 1.0, "rewards/chosen": 4.458505153656006, "rewards/margins": 2.206555128097534, "rewards/rejected": 2.2519500255584717, "step": 7662 }, { "epoch": 1.7, "learning_rate": 5.934601541162649e-07, "logits/chosen": -1.8512718677520752, "logits/rejected": -1.8512718677520752, "logps/chosen": -7.162416934967041, "logps/rejected": -7.162416934967041, "loss": 0.3981, "rewards/accuracies": 0.0, "rewards/chosen": 3.1441946029663086, "rewards/margins": 0.0, "rewards/rejected": 3.1441946029663086, "step": 7663 }, { "epoch": 1.7, "learning_rate": 5.926134863449712e-07, "logits/chosen": -1.9638901948928833, "logits/rejected": -1.9638901948928833, "logps/chosen": -47.103843688964844, "logps/rejected": -47.103843688964844, "loss": 0.3644, "rewards/accuracies": 0.0, "rewards/chosen": 4.3459320068359375, "rewards/margins": 0.0, "rewards/rejected": 4.3459320068359375, "step": 7664 }, { "epoch": 1.7, "learning_rate": 5.91767384912249e-07, "logits/chosen": -1.7717841863632202, "logits/rejected": -1.7493700981140137, "logps/chosen": -85.29061889648438, "logps/rejected": -107.4283676147461, "loss": 0.1431, "rewards/accuracies": 1.0, "rewards/chosen": 4.046929359436035, "rewards/margins": 1.3326706886291504, "rewards/rejected": 2.7142586708068848, "step": 7665 }, { "epoch": 1.7, "learning_rate": 5.90921849926821e-07, "logits/chosen": -2.0686230659484863, "logits/rejected": -2.052222490310669, "logps/chosen": -122.10354614257812, "logps/rejected": -94.083740234375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 6.5760817527771, "rewards/margins": 4.438634872436523, "rewards/rejected": 2.137446641921997, "step": 7666 }, { "epoch": 1.7, "learning_rate": 5.900768814973362e-07, "logits/chosen": -1.807376742362976, "logits/rejected": -1.7857595682144165, "logps/chosen": -58.970245361328125, "logps/rejected": -50.52593994140625, "loss": 0.1415, "rewards/accuracies": 1.0, "rewards/chosen": 5.252815246582031, "rewards/margins": 1.182908535003662, "rewards/rejected": 4.069906711578369, "step": 7667 }, { "epoch": 1.7, "learning_rate": 5.892324797323711e-07, "logits/chosen": -1.9491163492202759, "logits/rejected": -1.8578517436981201, "logps/chosen": -70.27450561523438, "logps/rejected": -76.09918212890625, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": 7.422168254852295, "rewards/margins": 2.4401779174804688, "rewards/rejected": 4.981990337371826, "step": 7668 }, { "epoch": 1.7, "learning_rate": 5.883886447404291e-07, "logits/chosen": -1.8766742944717407, "logits/rejected": -1.8580496311187744, "logps/chosen": -30.209789276123047, "logps/rejected": -73.30796813964844, "loss": 0.8884, "rewards/accuracies": 0.0, "rewards/chosen": 2.790621519088745, "rewards/margins": -1.1608891487121582, "rewards/rejected": 3.9515106678009033, "step": 7669 }, { "epoch": 1.7, "learning_rate": 5.875453766299416e-07, "logits/chosen": -1.8435741662979126, "logits/rejected": -1.5702804327011108, "logps/chosen": -184.1251678466797, "logps/rejected": -14.035642623901367, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 7.834205627441406, "rewards/margins": 7.455805778503418, "rewards/rejected": 0.378400057554245, "step": 7670 }, { "epoch": 1.7, "learning_rate": 5.867026755092653e-07, "logits/chosen": -2.2374086380004883, "logits/rejected": -2.1949617862701416, "logps/chosen": -52.94719314575195, "logps/rejected": -48.00523376464844, "loss": 0.1877, "rewards/accuracies": 1.0, "rewards/chosen": 3.2126972675323486, "rewards/margins": 1.9566887617111206, "rewards/rejected": 1.256008505821228, "step": 7671 }, { "epoch": 1.7, "learning_rate": 5.858605414866858e-07, "logits/chosen": -1.979385495185852, "logits/rejected": -1.9689100980758667, "logps/chosen": -34.873146057128906, "logps/rejected": -41.109161376953125, "loss": 0.6452, "rewards/accuracies": 1.0, "rewards/chosen": 2.8016936779022217, "rewards/margins": 1.0387457609176636, "rewards/rejected": 1.762947916984558, "step": 7672 }, { "epoch": 1.7, "learning_rate": 5.850189746704155e-07, "logits/chosen": -2.0566186904907227, "logits/rejected": -2.0525197982788086, "logps/chosen": -53.09641647338867, "logps/rejected": -39.800941467285156, "loss": 0.2618, "rewards/accuracies": 1.0, "rewards/chosen": 4.8062872886657715, "rewards/margins": 0.7035322189331055, "rewards/rejected": 4.102755069732666, "step": 7673 }, { "epoch": 1.7, "learning_rate": 5.841779751685939e-07, "logits/chosen": -2.1560254096984863, "logits/rejected": -2.1615633964538574, "logps/chosen": -71.9474105834961, "logps/rejected": -78.52804565429688, "loss": 1.118, "rewards/accuracies": 0.0, "rewards/chosen": 9.122836112976074, "rewards/margins": -2.0732154846191406, "rewards/rejected": 11.196051597595215, "step": 7674 }, { "epoch": 1.7, "learning_rate": 5.833375430892868e-07, "logits/chosen": -1.792803406715393, "logits/rejected": -1.807707667350769, "logps/chosen": -54.430389404296875, "logps/rejected": -103.7529296875, "loss": 0.3197, "rewards/accuracies": 1.0, "rewards/chosen": 4.546194553375244, "rewards/margins": 0.22762584686279297, "rewards/rejected": 4.318568706512451, "step": 7675 }, { "epoch": 1.7, "learning_rate": 5.824976785404879e-07, "logits/chosen": -1.7281233072280884, "logits/rejected": -1.6961588859558105, "logps/chosen": -41.278499603271484, "logps/rejected": -26.98407745361328, "loss": 0.189, "rewards/accuracies": 1.0, "rewards/chosen": 3.766538619995117, "rewards/margins": 1.4775810241699219, "rewards/rejected": 2.2889575958251953, "step": 7676 }, { "epoch": 1.7, "learning_rate": 5.816583816301197e-07, "logits/chosen": -1.5133183002471924, "logits/rejected": -1.5036031007766724, "logps/chosen": -6.339375972747803, "logps/rejected": -3.6954214572906494, "loss": 0.4217, "rewards/accuracies": 0.0, "rewards/chosen": 0.7222055792808533, "rewards/margins": -0.2455708384513855, "rewards/rejected": 0.9677764177322388, "step": 7677 }, { "epoch": 1.7, "learning_rate": 5.808196524660253e-07, "logits/chosen": -1.8505263328552246, "logits/rejected": -1.747137188911438, "logps/chosen": -49.324424743652344, "logps/rejected": -23.425033569335938, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 3.498790740966797, "rewards/margins": 2.9325807094573975, "rewards/rejected": 0.5662099719047546, "step": 7678 }, { "epoch": 1.7, "learning_rate": 5.799814911559842e-07, "logits/chosen": -2.0685904026031494, "logits/rejected": -2.115029811859131, "logps/chosen": -45.496742248535156, "logps/rejected": -86.20441436767578, "loss": 1.1982, "rewards/accuracies": 0.0, "rewards/chosen": 3.539238691329956, "rewards/margins": -2.2122561931610107, "rewards/rejected": 5.751494884490967, "step": 7679 }, { "epoch": 1.7, "learning_rate": 5.791438978076941e-07, "logits/chosen": -2.038626194000244, "logits/rejected": -1.9924681186676025, "logps/chosen": -54.21622848510742, "logps/rejected": -43.43634033203125, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 6.349271774291992, "rewards/margins": 3.447735071182251, "rewards/rejected": 2.901536703109741, "step": 7680 }, { "epoch": 1.7, "learning_rate": 5.783068725287882e-07, "logits/chosen": -2.1379613876342773, "logits/rejected": -2.1489341259002686, "logps/chosen": -56.44422912597656, "logps/rejected": -131.74771118164062, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": 10.099588394165039, "rewards/margins": 1.737849235534668, "rewards/rejected": 8.361739158630371, "step": 7681 }, { "epoch": 1.7, "learning_rate": 5.774704154268184e-07, "logits/chosen": -1.9567159414291382, "logits/rejected": -1.8734687566757202, "logps/chosen": -150.36758422851562, "logps/rejected": -72.00165557861328, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 3.2108705043792725, "rewards/margins": 2.5205795764923096, "rewards/rejected": 0.6902908682823181, "step": 7682 }, { "epoch": 1.7, "learning_rate": 5.766345266092688e-07, "logits/chosen": -2.311192512512207, "logits/rejected": -2.3193540573120117, "logps/chosen": -88.2291259765625, "logps/rejected": -84.48471069335938, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 6.742236614227295, "rewards/margins": 2.4234466552734375, "rewards/rejected": 4.318789958953857, "step": 7683 }, { "epoch": 1.7, "learning_rate": 5.757992061835488e-07, "logits/chosen": -1.7096035480499268, "logits/rejected": -1.7306756973266602, "logps/chosen": -31.730234146118164, "logps/rejected": -57.24579620361328, "loss": 0.4416, "rewards/accuracies": 0.0, "rewards/chosen": 2.7212073802948, "rewards/margins": -0.34752964973449707, "rewards/rejected": 3.068737030029297, "step": 7684 }, { "epoch": 1.7, "learning_rate": 5.749644542569954e-07, "logits/chosen": -1.88556706905365, "logits/rejected": -1.875367283821106, "logps/chosen": -31.936567306518555, "logps/rejected": -57.974998474121094, "loss": 0.4982, "rewards/accuracies": 0.0, "rewards/chosen": 5.330322265625, "rewards/margins": -0.1068277359008789, "rewards/rejected": 5.437150001525879, "step": 7685 }, { "epoch": 1.7, "learning_rate": 5.741302709368729e-07, "logits/chosen": -2.1278762817382812, "logits/rejected": -1.9315788745880127, "logps/chosen": -116.5340576171875, "logps/rejected": -12.609322547912598, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 8.964380264282227, "rewards/margins": 5.661500930786133, "rewards/rejected": 3.3028793334960938, "step": 7686 }, { "epoch": 1.7, "learning_rate": 5.732966563303693e-07, "logits/chosen": -1.7393828630447388, "logits/rejected": -1.7363306283950806, "logps/chosen": -29.479713439941406, "logps/rejected": -47.97614669799805, "loss": 0.6549, "rewards/accuracies": 0.0, "rewards/chosen": 3.6397476196289062, "rewards/margins": -0.6524157524108887, "rewards/rejected": 4.292163372039795, "step": 7687 }, { "epoch": 1.7, "learning_rate": 5.724636105446063e-07, "logits/chosen": -2.0582127571105957, "logits/rejected": -2.0874245166778564, "logps/chosen": -102.19491577148438, "logps/rejected": -51.99498748779297, "loss": 0.1107, "rewards/accuracies": 1.0, "rewards/chosen": 7.035739421844482, "rewards/margins": 1.3987274169921875, "rewards/rejected": 5.637012004852295, "step": 7688 }, { "epoch": 1.7, "learning_rate": 5.716311336866237e-07, "logits/chosen": -2.019953489303589, "logits/rejected": -1.91952383518219, "logps/chosen": -131.27902221679688, "logps/rejected": -67.30589294433594, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 7.527517795562744, "rewards/margins": 3.051358222961426, "rewards/rejected": 4.476159572601318, "step": 7689 }, { "epoch": 1.7, "learning_rate": 5.707992258633965e-07, "logits/chosen": -2.0899605751037598, "logits/rejected": -2.0443265438079834, "logps/chosen": -102.57969665527344, "logps/rejected": -66.97586059570312, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": 5.328910827636719, "rewards/margins": 1.9444639682769775, "rewards/rejected": 3.384446859359741, "step": 7690 }, { "epoch": 1.7, "learning_rate": 5.699678871818204e-07, "logits/chosen": -2.057415723800659, "logits/rejected": -2.056973457336426, "logps/chosen": -88.19229125976562, "logps/rejected": -89.40139770507812, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": 8.387921333312988, "rewards/margins": 1.5468993186950684, "rewards/rejected": 6.84102201461792, "step": 7691 }, { "epoch": 1.7, "learning_rate": 5.691371177487215e-07, "logits/chosen": -2.0600802898406982, "logits/rejected": -2.0749878883361816, "logps/chosen": -52.12248229980469, "logps/rejected": -54.60131072998047, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 4.099081516265869, "rewards/margins": 2.1005699634552, "rewards/rejected": 1.998511552810669, "step": 7692 }, { "epoch": 1.7, "learning_rate": 5.683069176708511e-07, "logits/chosen": -1.6847901344299316, "logits/rejected": -1.621461272239685, "logps/chosen": -36.71105194091797, "logps/rejected": -61.42456817626953, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 7.1636223793029785, "rewards/margins": 1.334184169769287, "rewards/rejected": 5.829438209533691, "step": 7693 }, { "epoch": 1.7, "learning_rate": 5.67477287054889e-07, "logits/chosen": -1.8094584941864014, "logits/rejected": -1.8094584941864014, "logps/chosen": -23.962709426879883, "logps/rejected": -23.962709426879883, "loss": 0.3594, "rewards/accuracies": 0.0, "rewards/chosen": 1.4104880094528198, "rewards/margins": 0.0, "rewards/rejected": 1.4104880094528198, "step": 7694 }, { "epoch": 1.7, "learning_rate": 5.666482260074402e-07, "logits/chosen": -2.0044820308685303, "logits/rejected": -2.0186657905578613, "logps/chosen": -30.910736083984375, "logps/rejected": -24.10128402709961, "loss": 0.2809, "rewards/accuracies": 1.0, "rewards/chosen": 3.3003175258636475, "rewards/margins": 0.8018505573272705, "rewards/rejected": 2.498466968536377, "step": 7695 }, { "epoch": 1.7, "learning_rate": 5.658197346350353e-07, "logits/chosen": -1.701310634613037, "logits/rejected": -1.6385658979415894, "logps/chosen": -79.70050048828125, "logps/rejected": -109.68319702148438, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 6.690727233886719, "rewards/margins": 4.350717067718506, "rewards/rejected": 2.340010166168213, "step": 7696 }, { "epoch": 1.7, "learning_rate": 5.649918130441367e-07, "logits/chosen": -1.9781713485717773, "logits/rejected": -1.9263079166412354, "logps/chosen": -77.15776062011719, "logps/rejected": -86.03569793701172, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 6.984829902648926, "rewards/margins": 3.0595192909240723, "rewards/rejected": 3.9253106117248535, "step": 7697 }, { "epoch": 1.7, "learning_rate": 5.641644613411274e-07, "logits/chosen": -2.0780375003814697, "logits/rejected": -1.9786876440048218, "logps/chosen": -104.35175323486328, "logps/rejected": -47.72834014892578, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 7.486587047576904, "rewards/margins": 2.1197071075439453, "rewards/rejected": 5.366879940032959, "step": 7698 }, { "epoch": 1.7, "learning_rate": 5.633376796323225e-07, "logits/chosen": -2.091059923171997, "logits/rejected": -2.048680305480957, "logps/chosen": -53.37950897216797, "logps/rejected": -20.15594482421875, "loss": 0.2518, "rewards/accuracies": 1.0, "rewards/chosen": 5.0527215003967285, "rewards/margins": 1.3178062438964844, "rewards/rejected": 3.734915256500244, "step": 7699 }, { "epoch": 1.7, "learning_rate": 5.625114680239596e-07, "logits/chosen": -1.8000781536102295, "logits/rejected": -1.7721879482269287, "logps/chosen": -63.407020568847656, "logps/rejected": -75.31427764892578, "loss": 0.8389, "rewards/accuracies": 1.0, "rewards/chosen": 4.176732063293457, "rewards/margins": 1.6068031787872314, "rewards/rejected": 2.5699288845062256, "step": 7700 }, { "epoch": 1.7, "learning_rate": 5.616858266222058e-07, "logits/chosen": -1.4970805644989014, "logits/rejected": -1.4970805644989014, "logps/chosen": -35.85214614868164, "logps/rejected": -35.85214614868164, "loss": 0.3744, "rewards/accuracies": 0.0, "rewards/chosen": 5.907927989959717, "rewards/margins": 0.0, "rewards/rejected": 5.907927989959717, "step": 7701 }, { "epoch": 1.7, "learning_rate": 5.608607555331541e-07, "logits/chosen": -1.980028748512268, "logits/rejected": -1.8392362594604492, "logps/chosen": -163.36114501953125, "logps/rejected": -144.37515258789062, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 7.61761474609375, "rewards/margins": 4.203123092651367, "rewards/rejected": 3.4144914150238037, "step": 7702 }, { "epoch": 1.7, "learning_rate": 5.600362548628235e-07, "logits/chosen": -2.285748243331909, "logits/rejected": -2.2463066577911377, "logps/chosen": -53.277774810791016, "logps/rejected": -21.217763900756836, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 3.3803768157958984, "rewards/margins": 2.1378743648529053, "rewards/rejected": 1.2425024509429932, "step": 7703 }, { "epoch": 1.71, "learning_rate": 5.592123247171622e-07, "logits/chosen": -2.0412893295288086, "logits/rejected": -2.06329083442688, "logps/chosen": -39.851810455322266, "logps/rejected": -62.19957733154297, "loss": 0.9666, "rewards/accuracies": 0.0, "rewards/chosen": 2.7797679901123047, "rewards/margins": -0.8985326290130615, "rewards/rejected": 3.678300619125366, "step": 7704 }, { "epoch": 1.71, "learning_rate": 5.583889652020391e-07, "logits/chosen": -1.9524239301681519, "logits/rejected": -1.9093505144119263, "logps/chosen": -45.65056228637695, "logps/rejected": -60.568275451660156, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 5.191403865814209, "rewards/margins": 2.1446759700775146, "rewards/rejected": 3.0467278957366943, "step": 7705 }, { "epoch": 1.71, "learning_rate": 5.575661764232593e-07, "logits/chosen": -1.8856757879257202, "logits/rejected": -1.976723551750183, "logps/chosen": -33.14991760253906, "logps/rejected": -58.263328552246094, "loss": 0.9561, "rewards/accuracies": 0.0, "rewards/chosen": 4.16524600982666, "rewards/margins": -1.3930420875549316, "rewards/rejected": 5.558288097381592, "step": 7706 }, { "epoch": 1.71, "learning_rate": 5.567439584865442e-07, "logits/chosen": -1.840844750404358, "logits/rejected": -1.7425129413604736, "logps/chosen": -144.62124633789062, "logps/rejected": -62.51897048950195, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 8.54309368133545, "rewards/margins": 2.840571880340576, "rewards/rejected": 5.702521800994873, "step": 7707 }, { "epoch": 1.71, "learning_rate": 5.55922311497551e-07, "logits/chosen": -1.6554266214370728, "logits/rejected": -1.6343212127685547, "logps/chosen": -15.213372230529785, "logps/rejected": -43.316627502441406, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": 1.4733821153640747, "rewards/margins": 1.210605502128601, "rewards/rejected": 0.26277658343315125, "step": 7708 }, { "epoch": 1.71, "learning_rate": 5.551012355618568e-07, "logits/chosen": -1.9054430723190308, "logits/rejected": -1.9054430723190308, "logps/chosen": -53.244728088378906, "logps/rejected": -53.244728088378906, "loss": 0.3559, "rewards/accuracies": 0.0, "rewards/chosen": 8.20845890045166, "rewards/margins": 0.0, "rewards/rejected": 8.20845890045166, "step": 7709 }, { "epoch": 1.71, "learning_rate": 5.542807307849684e-07, "logits/chosen": -2.140294313430786, "logits/rejected": -2.140294313430786, "logps/chosen": -38.636016845703125, "logps/rejected": -38.636016845703125, "loss": 0.3468, "rewards/accuracies": 0.0, "rewards/chosen": 5.310347080230713, "rewards/margins": 0.0, "rewards/rejected": 5.310347080230713, "step": 7710 }, { "epoch": 1.71, "learning_rate": 5.534607972723188e-07, "logits/chosen": -1.9234342575073242, "logits/rejected": -1.8857719898223877, "logps/chosen": -18.32224464416504, "logps/rejected": -174.3603973388672, "loss": 2.7044, "rewards/accuracies": 0.0, "rewards/chosen": 3.7305991649627686, "rewards/margins": -5.402605056762695, "rewards/rejected": 9.133204460144043, "step": 7711 }, { "epoch": 1.71, "learning_rate": 5.52641435129268e-07, "logits/chosen": -1.8564578294754028, "logits/rejected": -1.7846108675003052, "logps/chosen": -144.57835388183594, "logps/rejected": -147.28515625, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 11.140898704528809, "rewards/margins": 4.1876654624938965, "rewards/rejected": 6.953233242034912, "step": 7712 }, { "epoch": 1.71, "learning_rate": 5.518226444611024e-07, "logits/chosen": -2.0582218170166016, "logits/rejected": -2.0061886310577393, "logps/chosen": -100.38441467285156, "logps/rejected": -72.51270294189453, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 6.986183166503906, "rewards/margins": 2.6911606788635254, "rewards/rejected": 4.295022487640381, "step": 7713 }, { "epoch": 1.71, "learning_rate": 5.510044253730318e-07, "logits/chosen": -2.1628520488739014, "logits/rejected": -2.1134111881256104, "logps/chosen": -122.6046142578125, "logps/rejected": -67.20988464355469, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 11.037549018859863, "rewards/margins": 6.674168586730957, "rewards/rejected": 4.363380432128906, "step": 7714 }, { "epoch": 1.71, "learning_rate": 5.501867779701997e-07, "logits/chosen": -1.8790066242218018, "logits/rejected": -1.8155393600463867, "logps/chosen": -35.28017807006836, "logps/rejected": -38.32717514038086, "loss": 0.3813, "rewards/accuracies": 1.0, "rewards/chosen": 4.0523295402526855, "rewards/margins": 1.2602217197418213, "rewards/rejected": 2.7921078205108643, "step": 7715 }, { "epoch": 1.71, "learning_rate": 5.493697023576672e-07, "logits/chosen": -1.8095449209213257, "logits/rejected": -1.7931104898452759, "logps/chosen": -27.35684585571289, "logps/rejected": -47.79631042480469, "loss": 1.0285, "rewards/accuracies": 0.0, "rewards/chosen": 2.7851085662841797, "rewards/margins": -0.0882267951965332, "rewards/rejected": 2.873335361480713, "step": 7716 }, { "epoch": 1.71, "learning_rate": 5.485531986404308e-07, "logits/chosen": -2.0319342613220215, "logits/rejected": -2.026726007461548, "logps/chosen": -49.34244918823242, "logps/rejected": -38.76435852050781, "loss": 0.4182, "rewards/accuracies": 1.0, "rewards/chosen": 3.3334362506866455, "rewards/margins": 0.06987643241882324, "rewards/rejected": 3.2635598182678223, "step": 7717 }, { "epoch": 1.71, "learning_rate": 5.477372669234071e-07, "logits/chosen": -1.9649242162704468, "logits/rejected": -1.9667489528656006, "logps/chosen": -46.835777282714844, "logps/rejected": -77.60975646972656, "loss": 0.5119, "rewards/accuracies": 1.0, "rewards/chosen": 3.5899658203125, "rewards/margins": 0.9366819858551025, "rewards/rejected": 2.6532838344573975, "step": 7718 }, { "epoch": 1.71, "learning_rate": 5.469219073114413e-07, "logits/chosen": -2.032785177230835, "logits/rejected": -2.021366834640503, "logps/chosen": -56.426937103271484, "logps/rejected": -62.281959533691406, "loss": 0.3925, "rewards/accuracies": 0.0, "rewards/chosen": 4.144521236419678, "rewards/margins": -0.1290435791015625, "rewards/rejected": 4.27356481552124, "step": 7719 }, { "epoch": 1.71, "learning_rate": 5.461071199093048e-07, "logits/chosen": -1.5457851886749268, "logits/rejected": -1.4704670906066895, "logps/chosen": -31.81894874572754, "logps/rejected": -43.143341064453125, "loss": 0.2243, "rewards/accuracies": 1.0, "rewards/chosen": 3.4094245433807373, "rewards/margins": 0.7055091857910156, "rewards/rejected": 2.7039153575897217, "step": 7720 }, { "epoch": 1.71, "learning_rate": 5.452929048216976e-07, "logits/chosen": -2.066248655319214, "logits/rejected": -2.0689308643341064, "logps/chosen": -57.68912887573242, "logps/rejected": -84.07300567626953, "loss": 1.2243, "rewards/accuracies": 0.0, "rewards/chosen": 2.345003843307495, "rewards/margins": -2.33223557472229, "rewards/rejected": 4.677239418029785, "step": 7721 }, { "epoch": 1.71, "learning_rate": 5.444792621532435e-07, "logits/chosen": -1.8687831163406372, "logits/rejected": -1.734580159187317, "logps/chosen": -91.97888946533203, "logps/rejected": -23.0357608795166, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 3.0149505138397217, "rewards/margins": 2.829361915588379, "rewards/rejected": 0.18558864295482635, "step": 7722 }, { "epoch": 1.71, "learning_rate": 5.436661920084924e-07, "logits/chosen": -2.038325548171997, "logits/rejected": -1.9890775680541992, "logps/chosen": -92.58738708496094, "logps/rejected": -44.90298843383789, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": 4.3923234939575195, "rewards/margins": 2.4947469234466553, "rewards/rejected": 1.8975765705108643, "step": 7723 }, { "epoch": 1.71, "learning_rate": 5.428536944919238e-07, "logits/chosen": -1.9809622764587402, "logits/rejected": -1.950825810432434, "logps/chosen": -27.495380401611328, "logps/rejected": -53.201663970947266, "loss": 0.1805, "rewards/accuracies": 1.0, "rewards/chosen": 3.313383102416992, "rewards/margins": 0.8844292163848877, "rewards/rejected": 2.4289538860321045, "step": 7724 }, { "epoch": 1.71, "learning_rate": 5.420417697079394e-07, "logits/chosen": -1.8244825601577759, "logits/rejected": -1.730978012084961, "logps/chosen": -157.29385375976562, "logps/rejected": -25.06989288330078, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 4.150622844696045, "rewards/margins": 2.233713150024414, "rewards/rejected": 1.9169098138809204, "step": 7725 }, { "epoch": 1.71, "learning_rate": 5.412304177608729e-07, "logits/chosen": -1.8087478876113892, "logits/rejected": -1.6588548421859741, "logps/chosen": -70.54907989501953, "logps/rejected": -41.667381286621094, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 4.68187952041626, "rewards/margins": 2.833646297454834, "rewards/rejected": 1.8482331037521362, "step": 7726 }, { "epoch": 1.71, "learning_rate": 5.404196387549782e-07, "logits/chosen": -2.0496602058410645, "logits/rejected": -1.6497368812561035, "logps/chosen": -128.662109375, "logps/rejected": -56.59206771850586, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 8.55662727355957, "rewards/margins": 6.4367570877075195, "rewards/rejected": 2.11987042427063, "step": 7727 }, { "epoch": 1.71, "learning_rate": 5.39609432794439e-07, "logits/chosen": -2.247853994369507, "logits/rejected": -2.276977300643921, "logps/chosen": -23.235490798950195, "logps/rejected": -48.639793395996094, "loss": 1.303, "rewards/accuracies": 0.0, "rewards/chosen": 3.31974720954895, "rewards/margins": -2.51664137840271, "rewards/rejected": 5.83638858795166, "step": 7728 }, { "epoch": 1.71, "learning_rate": 5.387997999833655e-07, "logits/chosen": -2.189633369445801, "logits/rejected": -2.111116647720337, "logps/chosen": -124.04902648925781, "logps/rejected": -40.11894226074219, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 3.700056552886963, "rewards/margins": 2.848560333251953, "rewards/rejected": 0.851496160030365, "step": 7729 }, { "epoch": 1.71, "learning_rate": 5.379907404257934e-07, "logits/chosen": -1.9105064868927002, "logits/rejected": -1.8109480142593384, "logps/chosen": -79.03136444091797, "logps/rejected": -43.05652618408203, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 6.154326915740967, "rewards/margins": 2.9136950969696045, "rewards/rejected": 3.2406318187713623, "step": 7730 }, { "epoch": 1.71, "learning_rate": 5.371822542256843e-07, "logits/chosen": -1.8144084215164185, "logits/rejected": -1.836534857749939, "logps/chosen": -41.21366882324219, "logps/rejected": -72.40988159179688, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 4.471817970275879, "rewards/margins": 1.709179162979126, "rewards/rejected": 2.762638807296753, "step": 7731 }, { "epoch": 1.71, "learning_rate": 5.363743414869276e-07, "logits/chosen": -2.0931930541992188, "logits/rejected": -2.0239200592041016, "logps/chosen": -148.16766357421875, "logps/rejected": -108.37185668945312, "loss": 0.3243, "rewards/accuracies": 1.0, "rewards/chosen": 7.8207855224609375, "rewards/margins": 1.9309844970703125, "rewards/rejected": 5.889801025390625, "step": 7732 }, { "epoch": 1.71, "learning_rate": 5.355670023133391e-07, "logits/chosen": -2.130286931991577, "logits/rejected": -2.1581716537475586, "logps/chosen": -66.85220336914062, "logps/rejected": -101.85969543457031, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": 9.11963176727295, "rewards/margins": 2.9911437034606934, "rewards/rejected": 6.128488063812256, "step": 7733 }, { "epoch": 1.71, "learning_rate": 5.347602368086563e-07, "logits/chosen": -1.9492127895355225, "logits/rejected": -1.9847080707550049, "logps/chosen": -103.25504302978516, "logps/rejected": -101.66178894042969, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 9.558062553405762, "rewards/margins": 3.365403652191162, "rewards/rejected": 6.1926589012146, "step": 7734 }, { "epoch": 1.71, "learning_rate": 5.339540450765507e-07, "logits/chosen": -2.0355119705200195, "logits/rejected": -2.0226573944091797, "logps/chosen": -41.39122009277344, "logps/rejected": -31.341991424560547, "loss": 0.4258, "rewards/accuracies": 1.0, "rewards/chosen": 4.709072113037109, "rewards/margins": 0.1899271011352539, "rewards/rejected": 4.5191450119018555, "step": 7735 }, { "epoch": 1.71, "learning_rate": 5.331484272206134e-07, "logits/chosen": -2.1518523693084717, "logits/rejected": -2.0921263694763184, "logps/chosen": -115.25387573242188, "logps/rejected": -58.94798278808594, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 6.746290683746338, "rewards/margins": 3.508427381515503, "rewards/rejected": 3.237863302230835, "step": 7736 }, { "epoch": 1.71, "learning_rate": 5.323433833443647e-07, "logits/chosen": -2.178053855895996, "logits/rejected": -2.178053855895996, "logps/chosen": -80.0473861694336, "logps/rejected": -80.0473861694336, "loss": 0.3488, "rewards/accuracies": 0.0, "rewards/chosen": 6.3287577629089355, "rewards/margins": 0.0, "rewards/rejected": 6.3287577629089355, "step": 7737 }, { "epoch": 1.71, "learning_rate": 5.315389135512523e-07, "logits/chosen": -1.8796151876449585, "logits/rejected": -1.8658397197723389, "logps/chosen": -34.09549331665039, "logps/rejected": -10.54680061340332, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 5.198780536651611, "rewards/margins": 3.7354612350463867, "rewards/rejected": 1.463319182395935, "step": 7738 }, { "epoch": 1.71, "learning_rate": 5.307350179446469e-07, "logits/chosen": -1.9885530471801758, "logits/rejected": -1.9539092779159546, "logps/chosen": -97.1097183227539, "logps/rejected": -80.49573516845703, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 10.01496410369873, "rewards/margins": 2.179192066192627, "rewards/rejected": 7.8357720375061035, "step": 7739 }, { "epoch": 1.71, "learning_rate": 5.299316966278478e-07, "logits/chosen": -1.9867068529129028, "logits/rejected": -1.9204293489456177, "logps/chosen": -43.64451599121094, "logps/rejected": -66.15283966064453, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 5.28851842880249, "rewards/margins": 0.2642483711242676, "rewards/rejected": 5.024270057678223, "step": 7740 }, { "epoch": 1.71, "learning_rate": 5.291289497040803e-07, "logits/chosen": -1.9211069345474243, "logits/rejected": -1.6854006052017212, "logps/chosen": -45.91999053955078, "logps/rejected": -111.84390258789062, "loss": 0.2855, "rewards/accuracies": 1.0, "rewards/chosen": 8.330350875854492, "rewards/margins": 1.5102276802062988, "rewards/rejected": 6.820123195648193, "step": 7741 }, { "epoch": 1.71, "learning_rate": 5.28326777276496e-07, "logits/chosen": -1.9621905088424683, "logits/rejected": -1.9268463850021362, "logps/chosen": -175.67330932617188, "logps/rejected": -57.1573486328125, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": 6.972224712371826, "rewards/margins": 2.3270111083984375, "rewards/rejected": 4.645213603973389, "step": 7742 }, { "epoch": 1.71, "learning_rate": 5.275251794481689e-07, "logits/chosen": -1.6380788087844849, "logits/rejected": -1.5613605976104736, "logps/chosen": -26.90223503112793, "logps/rejected": -32.068763732910156, "loss": 0.1386, "rewards/accuracies": 1.0, "rewards/chosen": 1.8596200942993164, "rewards/margins": 1.1912903785705566, "rewards/rejected": 0.668329656124115, "step": 7743 }, { "epoch": 1.71, "learning_rate": 5.267241563221071e-07, "logits/chosen": -1.8701133728027344, "logits/rejected": -1.777783989906311, "logps/chosen": -130.84352111816406, "logps/rejected": -81.7312240600586, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 6.944836616516113, "rewards/margins": 5.341367721557617, "rewards/rejected": 1.6034691333770752, "step": 7744 }, { "epoch": 1.71, "learning_rate": 5.259237080012369e-07, "logits/chosen": -1.8891689777374268, "logits/rejected": -1.8315608501434326, "logps/chosen": -20.4512939453125, "logps/rejected": -3.3083157539367676, "loss": 0.284, "rewards/accuracies": 1.0, "rewards/chosen": 1.7479175329208374, "rewards/margins": 1.0270731449127197, "rewards/rejected": 0.7208443284034729, "step": 7745 }, { "epoch": 1.71, "learning_rate": 5.251238345884146e-07, "logits/chosen": -2.032010078430176, "logits/rejected": -2.004383087158203, "logps/chosen": -32.127235412597656, "logps/rejected": -64.7499771118164, "loss": 0.2387, "rewards/accuracies": 1.0, "rewards/chosen": 3.142273426055908, "rewards/margins": 0.7018520832061768, "rewards/rejected": 2.4404213428497314, "step": 7746 }, { "epoch": 1.71, "learning_rate": 5.243245361864219e-07, "logits/chosen": -1.6046470403671265, "logits/rejected": -1.5779428482055664, "logps/chosen": -21.311466217041016, "logps/rejected": -42.51075744628906, "loss": 1.2312, "rewards/accuracies": 0.0, "rewards/chosen": 2.659471273422241, "rewards/margins": -1.8222978115081787, "rewards/rejected": 4.48176908493042, "step": 7747 }, { "epoch": 1.71, "learning_rate": 5.235258128979676e-07, "logits/chosen": -2.107461929321289, "logits/rejected": -2.049053192138672, "logps/chosen": -92.54460906982422, "logps/rejected": -44.85110092163086, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 5.066565990447998, "rewards/margins": 3.2950098514556885, "rewards/rejected": 1.7715561389923096, "step": 7748 }, { "epoch": 1.72, "learning_rate": 5.227276648256851e-07, "logits/chosen": -2.159968137741089, "logits/rejected": -2.135960102081299, "logps/chosen": -31.498130798339844, "logps/rejected": -53.028221130371094, "loss": 0.533, "rewards/accuracies": 0.0, "rewards/chosen": 3.6493470668792725, "rewards/margins": -0.3524610996246338, "rewards/rejected": 4.001808166503906, "step": 7749 }, { "epoch": 1.72, "learning_rate": 5.219300920721343e-07, "logits/chosen": -2.2702243328094482, "logits/rejected": -2.2117998600006104, "logps/chosen": -139.07427978515625, "logps/rejected": -94.4638671875, "loss": 0.0655, "rewards/accuracies": 1.0, "rewards/chosen": 7.181997776031494, "rewards/margins": 1.9730424880981445, "rewards/rejected": 5.20895528793335, "step": 7750 }, { "epoch": 1.72, "learning_rate": 5.211330947398019e-07, "logits/chosen": -1.8697192668914795, "logits/rejected": -1.8726201057434082, "logps/chosen": -33.31468200683594, "logps/rejected": -50.84858322143555, "loss": 0.9208, "rewards/accuracies": 1.0, "rewards/chosen": 4.562963962554932, "rewards/margins": 1.545114517211914, "rewards/rejected": 3.0178494453430176, "step": 7751 }, { "epoch": 1.72, "learning_rate": 5.20336672931101e-07, "logits/chosen": -2.1182658672332764, "logits/rejected": -2.1266255378723145, "logps/chosen": -111.97396087646484, "logps/rejected": -84.6144027709961, "loss": 0.1585, "rewards/accuracies": 1.0, "rewards/chosen": 7.571727752685547, "rewards/margins": 3.305424213409424, "rewards/rejected": 4.266303539276123, "step": 7752 }, { "epoch": 1.72, "learning_rate": 5.195408267483676e-07, "logits/chosen": -1.9790422916412354, "logits/rejected": -1.8691915273666382, "logps/chosen": -65.55583953857422, "logps/rejected": -23.62279510498047, "loss": 0.234, "rewards/accuracies": 1.0, "rewards/chosen": 2.0755181312561035, "rewards/margins": 1.8826273679733276, "rewards/rejected": 0.19289074838161469, "step": 7753 }, { "epoch": 1.72, "learning_rate": 5.187455562938676e-07, "logits/chosen": -1.9989063739776611, "logits/rejected": -1.9335273504257202, "logps/chosen": -77.82249450683594, "logps/rejected": -71.97116088867188, "loss": 0.1476, "rewards/accuracies": 1.0, "rewards/chosen": 6.8022356033325195, "rewards/margins": 1.2272906303405762, "rewards/rejected": 5.574944972991943, "step": 7754 }, { "epoch": 1.72, "learning_rate": 5.179508616697909e-07, "logits/chosen": -1.980669617652893, "logits/rejected": -2.0085363388061523, "logps/chosen": -30.423419952392578, "logps/rejected": -88.56623077392578, "loss": 1.7311, "rewards/accuracies": 0.0, "rewards/chosen": 3.9630825519561768, "rewards/margins": -3.1988155841827393, "rewards/rejected": 7.161898136138916, "step": 7755 }, { "epoch": 1.72, "learning_rate": 5.171567429782537e-07, "logits/chosen": -1.9037253856658936, "logits/rejected": -1.9337693452835083, "logps/chosen": -44.55910110473633, "logps/rejected": -101.61880493164062, "loss": 1.3436, "rewards/accuracies": 0.0, "rewards/chosen": 5.6687331199646, "rewards/margins": -2.576569080352783, "rewards/rejected": 8.245302200317383, "step": 7756 }, { "epoch": 1.72, "learning_rate": 5.163632003212987e-07, "logits/chosen": -1.824588656425476, "logits/rejected": -1.746414065361023, "logps/chosen": -81.64736938476562, "logps/rejected": -44.96352005004883, "loss": 0.0785, "rewards/accuracies": 1.0, "rewards/chosen": 8.82348346710205, "rewards/margins": 1.9726243019104004, "rewards/rejected": 6.85085916519165, "step": 7757 }, { "epoch": 1.72, "learning_rate": 5.155702338008939e-07, "logits/chosen": -1.6471686363220215, "logits/rejected": -1.7301908731460571, "logps/chosen": -16.220914840698242, "logps/rejected": -103.17250061035156, "loss": 1.2814, "rewards/accuracies": 0.0, "rewards/chosen": 3.393955945968628, "rewards/margins": -1.7022340297698975, "rewards/rejected": 5.096189975738525, "step": 7758 }, { "epoch": 1.72, "learning_rate": 5.147778435189338e-07, "logits/chosen": -1.7830150127410889, "logits/rejected": -1.7644060850143433, "logps/chosen": -73.73638916015625, "logps/rejected": -46.02070617675781, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": 7.012732028961182, "rewards/margins": 1.633387565612793, "rewards/rejected": 5.379344463348389, "step": 7759 }, { "epoch": 1.72, "learning_rate": 5.139860295772387e-07, "logits/chosen": -2.1300134658813477, "logits/rejected": -2.119919538497925, "logps/chosen": -75.85260009765625, "logps/rejected": -76.8474349975586, "loss": 0.1956, "rewards/accuracies": 1.0, "rewards/chosen": 5.476233005523682, "rewards/margins": 1.548525333404541, "rewards/rejected": 3.9277076721191406, "step": 7760 }, { "epoch": 1.72, "learning_rate": 5.13194792077556e-07, "logits/chosen": -1.9110181331634521, "logits/rejected": -1.9318063259124756, "logps/chosen": -33.91239929199219, "logps/rejected": -66.09440612792969, "loss": 0.2775, "rewards/accuracies": 1.0, "rewards/chosen": 2.4839820861816406, "rewards/margins": 0.48598551750183105, "rewards/rejected": 1.9979965686798096, "step": 7761 }, { "epoch": 1.72, "learning_rate": 5.124041311215544e-07, "logits/chosen": -1.881555199623108, "logits/rejected": -1.9286487102508545, "logps/chosen": -72.25692749023438, "logps/rejected": -110.02436828613281, "loss": 0.4831, "rewards/accuracies": 0.0, "rewards/chosen": 4.0338592529296875, "rewards/margins": -0.4871230125427246, "rewards/rejected": 4.520982265472412, "step": 7762 }, { "epoch": 1.72, "learning_rate": 5.116140468108361e-07, "logits/chosen": -1.9105489253997803, "logits/rejected": -1.8906364440917969, "logps/chosen": -37.19432830810547, "logps/rejected": -50.15395736694336, "loss": 0.4704, "rewards/accuracies": 0.0, "rewards/chosen": 2.9745919704437256, "rewards/margins": -0.4459726810455322, "rewards/rejected": 3.420564651489258, "step": 7763 }, { "epoch": 1.72, "learning_rate": 5.108245392469219e-07, "logits/chosen": -1.9921915531158447, "logits/rejected": -1.8635395765304565, "logps/chosen": -136.492919921875, "logps/rejected": -58.38145065307617, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": 10.372159004211426, "rewards/margins": 6.188844203948975, "rewards/rejected": 4.183314800262451, "step": 7764 }, { "epoch": 1.72, "learning_rate": 5.100356085312624e-07, "logits/chosen": -2.2324094772338867, "logits/rejected": -2.1581766605377197, "logps/chosen": -55.32484817504883, "logps/rejected": -8.956429481506348, "loss": 0.5822, "rewards/accuracies": 0.0, "rewards/chosen": 3.4353625774383545, "rewards/margins": -0.0511777400970459, "rewards/rejected": 3.4865403175354004, "step": 7765 }, { "epoch": 1.72, "learning_rate": 5.092472547652338e-07, "logits/chosen": -1.8212138414382935, "logits/rejected": -1.8182746171951294, "logps/chosen": -43.75139617919922, "logps/rejected": -80.56432342529297, "loss": 0.2371, "rewards/accuracies": 1.0, "rewards/chosen": 9.294431686401367, "rewards/margins": 0.9495306015014648, "rewards/rejected": 8.344901084899902, "step": 7766 }, { "epoch": 1.72, "learning_rate": 5.084594780501378e-07, "logits/chosen": -1.7552841901779175, "logits/rejected": -1.7442013025283813, "logps/chosen": -114.82574462890625, "logps/rejected": -111.60493469238281, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 12.21140193939209, "rewards/margins": 5.451114177703857, "rewards/rejected": 6.760287761688232, "step": 7767 }, { "epoch": 1.72, "learning_rate": 5.076722784872012e-07, "logits/chosen": -1.7742465734481812, "logits/rejected": -1.7742465734481812, "logps/chosen": -47.798770904541016, "logps/rejected": -47.798770904541016, "loss": 0.3497, "rewards/accuracies": 0.0, "rewards/chosen": 3.5744106769561768, "rewards/margins": 0.0, "rewards/rejected": 3.5744106769561768, "step": 7768 }, { "epoch": 1.72, "learning_rate": 5.068856561775782e-07, "logits/chosen": -1.7776868343353271, "logits/rejected": -1.6319596767425537, "logps/chosen": -50.82585144042969, "logps/rejected": -41.688636779785156, "loss": 0.1235, "rewards/accuracies": 1.0, "rewards/chosen": 6.207141876220703, "rewards/margins": 1.459712028503418, "rewards/rejected": 4.747429847717285, "step": 7769 }, { "epoch": 1.72, "learning_rate": 5.060996112223476e-07, "logits/chosen": -1.81939697265625, "logits/rejected": -1.771369218826294, "logps/chosen": -58.309051513671875, "logps/rejected": -31.41436195373535, "loss": 0.3585, "rewards/accuracies": 1.0, "rewards/chosen": 3.251971483230591, "rewards/margins": 0.08343291282653809, "rewards/rejected": 3.1685385704040527, "step": 7770 }, { "epoch": 1.72, "learning_rate": 5.053141437225117e-07, "logits/chosen": -1.6621155738830566, "logits/rejected": -1.609562873840332, "logps/chosen": -62.63121795654297, "logps/rejected": -60.19385528564453, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 7.5777812004089355, "rewards/margins": 2.31087064743042, "rewards/rejected": 5.266910552978516, "step": 7771 }, { "epoch": 1.72, "learning_rate": 5.045292537790058e-07, "logits/chosen": -1.92550790309906, "logits/rejected": -1.902703046798706, "logps/chosen": -48.81919479370117, "logps/rejected": -58.12926483154297, "loss": 0.5761, "rewards/accuracies": 0.0, "rewards/chosen": 3.06632661819458, "rewards/margins": -0.7693743705749512, "rewards/rejected": 3.8357009887695312, "step": 7772 }, { "epoch": 1.72, "learning_rate": 5.037449414926832e-07, "logits/chosen": -1.6024789810180664, "logits/rejected": -1.5907024145126343, "logps/chosen": -37.93661880493164, "logps/rejected": -76.0214614868164, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 3.0490078926086426, "rewards/margins": 1.5605534315109253, "rewards/rejected": 1.4884544610977173, "step": 7773 }, { "epoch": 1.72, "learning_rate": 5.029612069643263e-07, "logits/chosen": -1.8035808801651, "logits/rejected": -1.7881205081939697, "logps/chosen": -43.99802017211914, "logps/rejected": -46.992889404296875, "loss": 0.2151, "rewards/accuracies": 1.0, "rewards/chosen": 3.417165756225586, "rewards/margins": 0.6421618461608887, "rewards/rejected": 2.7750039100646973, "step": 7774 }, { "epoch": 1.72, "learning_rate": 5.021780502946444e-07, "logits/chosen": -1.8339760303497314, "logits/rejected": -1.8203538656234741, "logps/chosen": -39.88166046142578, "logps/rejected": -73.43370056152344, "loss": 0.4458, "rewards/accuracies": 0.0, "rewards/chosen": 3.2594544887542725, "rewards/margins": -0.18311691284179688, "rewards/rejected": 3.4425714015960693, "step": 7775 }, { "epoch": 1.72, "learning_rate": 5.0139547158427e-07, "logits/chosen": -2.082369089126587, "logits/rejected": -2.1155498027801514, "logps/chosen": -92.03296661376953, "logps/rejected": -128.2681427001953, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": 8.090331077575684, "rewards/margins": 3.35292387008667, "rewards/rejected": 4.737407207489014, "step": 7776 }, { "epoch": 1.72, "learning_rate": 5.006134709337635e-07, "logits/chosen": -1.9187026023864746, "logits/rejected": -1.8940156698226929, "logps/chosen": -54.11513900756836, "logps/rejected": -47.87102127075195, "loss": 0.1757, "rewards/accuracies": 1.0, "rewards/chosen": 3.835078001022339, "rewards/margins": 0.9717178344726562, "rewards/rejected": 2.8633601665496826, "step": 7777 }, { "epoch": 1.72, "learning_rate": 4.998320484436098e-07, "logits/chosen": -1.5302181243896484, "logits/rejected": -1.5302181243896484, "logps/chosen": -98.58343505859375, "logps/rejected": -98.58343505859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 9.521079063415527, "rewards/margins": 0.0, "rewards/rejected": 9.521079063415527, "step": 7778 }, { "epoch": 1.72, "learning_rate": 4.990512042142204e-07, "logits/chosen": -2.013890504837036, "logits/rejected": -2.0053694248199463, "logps/chosen": -156.27745056152344, "logps/rejected": -79.32962036132812, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 8.908332824707031, "rewards/margins": 3.819422721862793, "rewards/rejected": 5.088910102844238, "step": 7779 }, { "epoch": 1.72, "learning_rate": 4.982709383459299e-07, "logits/chosen": -1.912622332572937, "logits/rejected": -1.8585879802703857, "logps/chosen": -39.78377914428711, "logps/rejected": -15.751130104064941, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": 2.6118342876434326, "rewards/margins": 1.7668938636779785, "rewards/rejected": 0.8449404835700989, "step": 7780 }, { "epoch": 1.72, "learning_rate": 4.974912509390034e-07, "logits/chosen": -1.8923656940460205, "logits/rejected": -1.8923656940460205, "logps/chosen": -34.030548095703125, "logps/rejected": -34.030548095703125, "loss": 0.3497, "rewards/accuracies": 0.0, "rewards/chosen": 4.052262306213379, "rewards/margins": 0.0, "rewards/rejected": 4.052262306213379, "step": 7781 }, { "epoch": 1.72, "learning_rate": 4.967121420936255e-07, "logits/chosen": -1.8835110664367676, "logits/rejected": -1.7939419746398926, "logps/chosen": -106.34677124023438, "logps/rejected": -51.566341400146484, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 4.800424098968506, "rewards/margins": 2.2339494228363037, "rewards/rejected": 2.566474676132202, "step": 7782 }, { "epoch": 1.72, "learning_rate": 4.95933611909914e-07, "logits/chosen": -1.7504082918167114, "logits/rejected": -1.6852622032165527, "logps/chosen": -90.97908020019531, "logps/rejected": -130.49862670898438, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 8.736689567565918, "rewards/margins": 4.391395092010498, "rewards/rejected": 4.34529447555542, "step": 7783 }, { "epoch": 1.72, "learning_rate": 4.951556604879049e-07, "logits/chosen": -1.9333797693252563, "logits/rejected": -1.9333797693252563, "logps/chosen": -36.24980163574219, "logps/rejected": -36.24980163574219, "loss": 0.3591, "rewards/accuracies": 0.0, "rewards/chosen": 3.1911301612854004, "rewards/margins": 0.0, "rewards/rejected": 3.1911301612854004, "step": 7784 }, { "epoch": 1.72, "learning_rate": 4.943782879275639e-07, "logits/chosen": -1.9753906726837158, "logits/rejected": -1.9954994916915894, "logps/chosen": -35.75990295410156, "logps/rejected": -53.38823699951172, "loss": 0.7894, "rewards/accuracies": 0.0, "rewards/chosen": 3.118154287338257, "rewards/margins": -0.8684370517730713, "rewards/rejected": 3.986591339111328, "step": 7785 }, { "epoch": 1.72, "learning_rate": 4.936014943287815e-07, "logits/chosen": -1.9577621221542358, "logits/rejected": -1.9573955535888672, "logps/chosen": -31.46612548828125, "logps/rejected": -51.765113830566406, "loss": 0.5514, "rewards/accuracies": 0.0, "rewards/chosen": 3.9978272914886475, "rewards/margins": -0.6081993579864502, "rewards/rejected": 4.606026649475098, "step": 7786 }, { "epoch": 1.72, "learning_rate": 4.928252797913741e-07, "logits/chosen": -2.323512077331543, "logits/rejected": -2.2415921688079834, "logps/chosen": -51.06684494018555, "logps/rejected": -90.60208129882812, "loss": 0.32, "rewards/accuracies": 1.0, "rewards/chosen": 4.764275074005127, "rewards/margins": 0.6416006088256836, "rewards/rejected": 4.122674465179443, "step": 7787 }, { "epoch": 1.72, "learning_rate": 4.920496444150847e-07, "logits/chosen": -2.0969889163970947, "logits/rejected": -2.0478506088256836, "logps/chosen": -46.84496307373047, "logps/rejected": -40.350650787353516, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 3.479999542236328, "rewards/margins": 4.128931045532227, "rewards/rejected": -0.6489315032958984, "step": 7788 }, { "epoch": 1.72, "learning_rate": 4.912745882995767e-07, "logits/chosen": -2.042989492416382, "logits/rejected": -2.022705554962158, "logps/chosen": -59.05703353881836, "logps/rejected": -79.1821517944336, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 5.7621283531188965, "rewards/margins": 2.706338405609131, "rewards/rejected": 3.0557899475097656, "step": 7789 }, { "epoch": 1.72, "learning_rate": 4.905001115444475e-07, "logits/chosen": -1.972562313079834, "logits/rejected": -1.9835306406021118, "logps/chosen": -103.41372680664062, "logps/rejected": -88.86009216308594, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 6.841866970062256, "rewards/margins": 2.565286159515381, "rewards/rejected": 4.276580810546875, "step": 7790 }, { "epoch": 1.72, "learning_rate": 4.897262142492116e-07, "logits/chosen": -1.8210158348083496, "logits/rejected": -1.8551453351974487, "logps/chosen": -39.91647720336914, "logps/rejected": -98.04383087158203, "loss": 0.2942, "rewards/accuracies": 1.0, "rewards/chosen": 4.232111930847168, "rewards/margins": 0.3524434566497803, "rewards/rejected": 3.8796684741973877, "step": 7791 }, { "epoch": 1.72, "learning_rate": 4.889528965133167e-07, "logits/chosen": -2.012687921524048, "logits/rejected": -1.9883571863174438, "logps/chosen": -44.89017868041992, "logps/rejected": -60.746482849121094, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": 5.544569969177246, "rewards/margins": 1.1303234100341797, "rewards/rejected": 4.414246559143066, "step": 7792 }, { "epoch": 1.72, "learning_rate": 4.88180158436129e-07, "logits/chosen": -1.9083504676818848, "logits/rejected": -2.003129243850708, "logps/chosen": -24.753520965576172, "logps/rejected": -92.9784164428711, "loss": 2.0304, "rewards/accuracies": 0.0, "rewards/chosen": 4.56746244430542, "rewards/margins": -3.025693416595459, "rewards/rejected": 7.593155860900879, "step": 7793 }, { "epoch": 1.73, "learning_rate": 4.874080001169457e-07, "logits/chosen": -1.7554149627685547, "logits/rejected": -1.7059561014175415, "logps/chosen": -42.41843795776367, "logps/rejected": -46.49840545654297, "loss": 0.2772, "rewards/accuracies": 1.0, "rewards/chosen": 3.3034169673919678, "rewards/margins": 0.49134182929992676, "rewards/rejected": 2.812075138092041, "step": 7794 }, { "epoch": 1.73, "learning_rate": 4.866364216549868e-07, "logits/chosen": -1.8427155017852783, "logits/rejected": -1.7549635171890259, "logps/chosen": -63.124637603759766, "logps/rejected": -26.792531967163086, "loss": 0.5382, "rewards/accuracies": 1.0, "rewards/chosen": 2.760695219039917, "rewards/margins": 2.1991233825683594, "rewards/rejected": 0.5615717172622681, "step": 7795 }, { "epoch": 1.73, "learning_rate": 4.858654231493959e-07, "logits/chosen": -1.9898467063903809, "logits/rejected": -1.8033661842346191, "logps/chosen": -85.68913269042969, "logps/rejected": -26.653127670288086, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 4.686221599578857, "rewards/margins": 3.112943649291992, "rewards/rejected": 1.5732778310775757, "step": 7796 }, { "epoch": 1.73, "learning_rate": 4.85095004699248e-07, "logits/chosen": -1.7683258056640625, "logits/rejected": -1.7683258056640625, "logps/chosen": -36.63743209838867, "logps/rejected": -36.63743209838867, "loss": 1.332, "rewards/accuracies": 0.0, "rewards/chosen": 3.425593137741089, "rewards/margins": 0.0, "rewards/rejected": 3.425593137741089, "step": 7797 }, { "epoch": 1.73, "learning_rate": 4.843251664035376e-07, "logits/chosen": -1.9093220233917236, "logits/rejected": -1.6960997581481934, "logps/chosen": -122.86164855957031, "logps/rejected": -55.831695556640625, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 7.423140048980713, "rewards/margins": 7.114468097686768, "rewards/rejected": 0.3086719512939453, "step": 7798 }, { "epoch": 1.73, "learning_rate": 4.835559083611891e-07, "logits/chosen": -2.05561900138855, "logits/rejected": -2.019619941711426, "logps/chosen": -80.20201110839844, "logps/rejected": -64.82638549804688, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 6.05751371383667, "rewards/margins": 2.785735607147217, "rewards/rejected": 3.271778106689453, "step": 7799 }, { "epoch": 1.73, "learning_rate": 4.827872306710474e-07, "logits/chosen": -1.7465671300888062, "logits/rejected": -1.7465671300888062, "logps/chosen": -37.48023986816406, "logps/rejected": -37.48023986816406, "loss": 0.3531, "rewards/accuracies": 0.0, "rewards/chosen": 3.5035042762756348, "rewards/margins": 0.0, "rewards/rejected": 3.5035042762756348, "step": 7800 }, { "epoch": 1.73, "learning_rate": 4.820191334318891e-07, "logits/chosen": -1.9706467390060425, "logits/rejected": -1.8809583187103271, "logps/chosen": -82.64012145996094, "logps/rejected": -62.89190673828125, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 3.351165771484375, "rewards/margins": 2.6149115562438965, "rewards/rejected": 0.7362541556358337, "step": 7801 }, { "epoch": 1.73, "learning_rate": 4.812516167424108e-07, "logits/chosen": -2.0995001792907715, "logits/rejected": -1.9992622137069702, "logps/chosen": -84.71931457519531, "logps/rejected": -59.82646179199219, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 7.433860778808594, "rewards/margins": 2.0175671577453613, "rewards/rejected": 5.416293621063232, "step": 7802 }, { "epoch": 1.73, "learning_rate": 4.80484680701237e-07, "logits/chosen": -1.9988360404968262, "logits/rejected": -1.892215609550476, "logps/chosen": -142.19747924804688, "logps/rejected": -144.16363525390625, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 8.626861572265625, "rewards/margins": 2.0181241035461426, "rewards/rejected": 6.608737468719482, "step": 7803 }, { "epoch": 1.73, "learning_rate": 4.797183254069176e-07, "logits/chosen": -1.9827724695205688, "logits/rejected": -1.9773703813552856, "logps/chosen": -59.92475891113281, "logps/rejected": -68.93183135986328, "loss": 0.1694, "rewards/accuracies": 1.0, "rewards/chosen": 8.346529006958008, "rewards/margins": 0.9352946281433105, "rewards/rejected": 7.411234378814697, "step": 7804 }, { "epoch": 1.73, "learning_rate": 4.789525509579251e-07, "logits/chosen": -1.9203026294708252, "logits/rejected": -1.8398104906082153, "logps/chosen": -41.134376525878906, "logps/rejected": -6.552794456481934, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 3.283644914627075, "rewards/margins": 2.0901131629943848, "rewards/rejected": 1.1935317516326904, "step": 7805 }, { "epoch": 1.73, "learning_rate": 4.781873574526635e-07, "logits/chosen": -1.7021116018295288, "logits/rejected": -1.760796308517456, "logps/chosen": -7.792922019958496, "logps/rejected": -34.172149658203125, "loss": 1.9055, "rewards/accuracies": 0.0, "rewards/chosen": 1.7282211780548096, "rewards/margins": -3.0863001346588135, "rewards/rejected": 4.814521312713623, "step": 7806 }, { "epoch": 1.73, "learning_rate": 4.774227449894548e-07, "logits/chosen": -2.041088104248047, "logits/rejected": -1.999531865119934, "logps/chosen": -59.77481460571289, "logps/rejected": -52.630210876464844, "loss": 0.2029, "rewards/accuracies": 1.0, "rewards/chosen": 4.093503952026367, "rewards/margins": 1.2119433879852295, "rewards/rejected": 2.8815605640411377, "step": 7807 }, { "epoch": 1.73, "learning_rate": 4.766587136665529e-07, "logits/chosen": -2.026893377304077, "logits/rejected": -2.026893377304077, "logps/chosen": -39.805850982666016, "logps/rejected": -39.805850982666016, "loss": 0.368, "rewards/accuracies": 0.0, "rewards/chosen": 4.67023229598999, "rewards/margins": 0.0, "rewards/rejected": 4.67023229598999, "step": 7808 }, { "epoch": 1.73, "learning_rate": 4.758952635821307e-07, "logits/chosen": -2.2856171131134033, "logits/rejected": -2.321200370788574, "logps/chosen": -130.4105987548828, "logps/rejected": -92.95452880859375, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": 10.62994384765625, "rewards/margins": 2.3342952728271484, "rewards/rejected": 8.295648574829102, "step": 7809 }, { "epoch": 1.73, "learning_rate": 4.751323948342934e-07, "logits/chosen": -2.011700391769409, "logits/rejected": -2.0295028686523438, "logps/chosen": -61.15313720703125, "logps/rejected": -95.25390625, "loss": 0.9334, "rewards/accuracies": 0.0, "rewards/chosen": 6.942073345184326, "rewards/margins": -1.6829218864440918, "rewards/rejected": 8.624995231628418, "step": 7810 }, { "epoch": 1.73, "learning_rate": 4.743701075210649e-07, "logits/chosen": -1.7834715843200684, "logits/rejected": -1.767399787902832, "logps/chosen": -62.83479309082031, "logps/rejected": -78.69734954833984, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 6.976296901702881, "rewards/margins": 2.382477283477783, "rewards/rejected": 4.593819618225098, "step": 7811 }, { "epoch": 1.73, "learning_rate": 4.73608401740398e-07, "logits/chosen": -1.805299162864685, "logits/rejected": -1.7015701532363892, "logps/chosen": -85.8760986328125, "logps/rejected": -39.85555648803711, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 4.809289455413818, "rewards/margins": 4.540318489074707, "rewards/rejected": 0.26897087693214417, "step": 7812 }, { "epoch": 1.73, "learning_rate": 4.7284727759017044e-07, "logits/chosen": -1.9754713773727417, "logits/rejected": -1.9754713773727417, "logps/chosen": -22.674396514892578, "logps/rejected": -22.674396514892578, "loss": 0.3467, "rewards/accuracies": 0.0, "rewards/chosen": 4.870149612426758, "rewards/margins": 0.0, "rewards/rejected": 4.870149612426758, "step": 7813 }, { "epoch": 1.73, "learning_rate": 4.7208673516818517e-07, "logits/chosen": -2.157670259475708, "logits/rejected": -2.1175904273986816, "logps/chosen": -141.66934204101562, "logps/rejected": -55.911712646484375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 7.105668544769287, "rewards/margins": 4.468935966491699, "rewards/rejected": 2.636732578277588, "step": 7814 }, { "epoch": 1.73, "learning_rate": 4.7132677457217004e-07, "logits/chosen": -1.879457712173462, "logits/rejected": -1.8381167650222778, "logps/chosen": -86.311279296875, "logps/rejected": -25.640871047973633, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 5.781932353973389, "rewards/margins": 2.287379026412964, "rewards/rejected": 3.494553327560425, "step": 7815 }, { "epoch": 1.73, "learning_rate": 4.705673958997764e-07, "logits/chosen": -1.906182050704956, "logits/rejected": -1.8302849531173706, "logps/chosen": -64.61849975585938, "logps/rejected": -17.292388916015625, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 2.5222153663635254, "rewards/margins": 1.9246766567230225, "rewards/rejected": 0.5975387692451477, "step": 7816 }, { "epoch": 1.73, "learning_rate": 4.698085992485851e-07, "logits/chosen": -2.099681854248047, "logits/rejected": -2.108593225479126, "logps/chosen": -49.839969635009766, "logps/rejected": -35.08588409423828, "loss": 0.3005, "rewards/accuracies": 1.0, "rewards/chosen": 4.469985485076904, "rewards/margins": 1.0459635257720947, "rewards/rejected": 3.4240219593048096, "step": 7817 }, { "epoch": 1.73, "learning_rate": 4.690503847160982e-07, "logits/chosen": -1.780393362045288, "logits/rejected": -1.9235937595367432, "logps/chosen": -41.603824615478516, "logps/rejected": -104.17422485351562, "loss": 2.5911, "rewards/accuracies": 0.0, "rewards/chosen": 2.6181721687316895, "rewards/margins": -5.117374897003174, "rewards/rejected": 7.735547065734863, "step": 7818 }, { "epoch": 1.73, "learning_rate": 4.682927523997444e-07, "logits/chosen": -1.8769036531448364, "logits/rejected": -1.8726319074630737, "logps/chosen": -24.140695571899414, "logps/rejected": -26.58623504638672, "loss": 0.6438, "rewards/accuracies": 0.0, "rewards/chosen": 1.265642762184143, "rewards/margins": -0.8987394571304321, "rewards/rejected": 2.164382219314575, "step": 7819 }, { "epoch": 1.73, "learning_rate": 4.675357023968785e-07, "logits/chosen": -1.8582597970962524, "logits/rejected": -1.8251869678497314, "logps/chosen": -64.30115509033203, "logps/rejected": -69.30148315429688, "loss": 0.2909, "rewards/accuracies": 1.0, "rewards/chosen": 4.386879920959473, "rewards/margins": 1.394352912902832, "rewards/rejected": 2.9925270080566406, "step": 7820 }, { "epoch": 1.73, "learning_rate": 4.6677923480477837e-07, "logits/chosen": -1.4940038919448853, "logits/rejected": -1.4940038919448853, "logps/chosen": -2.652191162109375, "logps/rejected": -2.652191162109375, "loss": 0.3492, "rewards/accuracies": 0.0, "rewards/chosen": 1.1124886274337769, "rewards/margins": 0.0, "rewards/rejected": 1.1124886274337769, "step": 7821 }, { "epoch": 1.73, "learning_rate": 4.6602334972064956e-07, "logits/chosen": -1.964072346687317, "logits/rejected": -1.9509758949279785, "logps/chosen": -54.65509796142578, "logps/rejected": -69.59603881835938, "loss": 0.1637, "rewards/accuracies": 1.0, "rewards/chosen": 5.191712379455566, "rewards/margins": 0.9486551284790039, "rewards/rejected": 4.2430572509765625, "step": 7822 }, { "epoch": 1.73, "learning_rate": 4.6526804724162043e-07, "logits/chosen": -1.9283794164657593, "logits/rejected": -1.913187861442566, "logps/chosen": -75.85584259033203, "logps/rejected": -45.088531494140625, "loss": 0.2037, "rewards/accuracies": 1.0, "rewards/chosen": 5.188003063201904, "rewards/margins": 0.7701377868652344, "rewards/rejected": 4.41786527633667, "step": 7823 }, { "epoch": 1.73, "learning_rate": 4.645133274647473e-07, "logits/chosen": -1.6093082427978516, "logits/rejected": -1.473105549812317, "logps/chosen": -39.06652069091797, "logps/rejected": -83.21615600585938, "loss": 1.5778, "rewards/accuracies": 0.0, "rewards/chosen": 3.342810869216919, "rewards/margins": -2.373464345932007, "rewards/rejected": 5.716275215148926, "step": 7824 }, { "epoch": 1.73, "learning_rate": 4.6375919048700704e-07, "logits/chosen": -1.5422378778457642, "logits/rejected": -1.5422378778457642, "logps/chosen": -15.142745971679688, "logps/rejected": -15.142745971679688, "loss": 0.3596, "rewards/accuracies": 0.0, "rewards/chosen": 1.5446758270263672, "rewards/margins": 0.0, "rewards/rejected": 1.5446758270263672, "step": 7825 }, { "epoch": 1.73, "learning_rate": 4.630056364053076e-07, "logits/chosen": -2.0338897705078125, "logits/rejected": -2.0484185218811035, "logps/chosen": -64.42575073242188, "logps/rejected": -48.53340148925781, "loss": 0.3053, "rewards/accuracies": 1.0, "rewards/chosen": 4.0363969802856445, "rewards/margins": 1.356013536453247, "rewards/rejected": 2.6803834438323975, "step": 7826 }, { "epoch": 1.73, "learning_rate": 4.6225266531647605e-07, "logits/chosen": -1.7556512355804443, "logits/rejected": -1.7556512355804443, "logps/chosen": -31.50450325012207, "logps/rejected": -31.50450325012207, "loss": 0.3546, "rewards/accuracies": 0.0, "rewards/chosen": 1.5228430032730103, "rewards/margins": 0.0, "rewards/rejected": 1.5228430032730103, "step": 7827 }, { "epoch": 1.73, "learning_rate": 4.615002773172689e-07, "logits/chosen": -1.81288743019104, "logits/rejected": -1.8510440587997437, "logps/chosen": -166.4683074951172, "logps/rejected": -140.7837371826172, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 8.030634880065918, "rewards/margins": 4.32137393951416, "rewards/rejected": 3.7092607021331787, "step": 7828 }, { "epoch": 1.73, "learning_rate": 4.607484725043654e-07, "logits/chosen": -2.0600576400756836, "logits/rejected": -2.0464377403259277, "logps/chosen": -86.95539093017578, "logps/rejected": -84.13240814208984, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 6.41457462310791, "rewards/margins": 3.1028215885162354, "rewards/rejected": 3.311753034591675, "step": 7829 }, { "epoch": 1.73, "learning_rate": 4.5999725097437167e-07, "logits/chosen": -1.99717378616333, "logits/rejected": -1.995236873626709, "logps/chosen": -28.76647186279297, "logps/rejected": -73.76856994628906, "loss": 0.7841, "rewards/accuracies": 0.0, "rewards/chosen": 3.2630774974823, "rewards/margins": -1.3158748149871826, "rewards/rejected": 4.578952312469482, "step": 7830 }, { "epoch": 1.73, "learning_rate": 4.5924661282381713e-07, "logits/chosen": -2.204244613647461, "logits/rejected": -2.1206583976745605, "logps/chosen": -141.66664123535156, "logps/rejected": -14.636058807373047, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 5.226048469543457, "rewards/margins": 4.006021022796631, "rewards/rejected": 1.2200275659561157, "step": 7831 }, { "epoch": 1.73, "learning_rate": 4.5849655814915683e-07, "logits/chosen": -1.7562013864517212, "logits/rejected": -1.9005191326141357, "logps/chosen": -54.24643325805664, "logps/rejected": -72.2502212524414, "loss": 3.262, "rewards/accuracies": 0.0, "rewards/chosen": 5.703246593475342, "rewards/margins": -6.486111164093018, "rewards/rejected": 12.18935775756836, "step": 7832 }, { "epoch": 1.73, "learning_rate": 4.577470870467715e-07, "logits/chosen": -2.1639115810394287, "logits/rejected": -2.023763656616211, "logps/chosen": -121.73212432861328, "logps/rejected": -48.271541595458984, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 8.814334869384766, "rewards/margins": 9.009163856506348, "rewards/rejected": -0.19482918083667755, "step": 7833 }, { "epoch": 1.73, "learning_rate": 4.569981996129663e-07, "logits/chosen": -1.5188319683074951, "logits/rejected": -1.5285407304763794, "logps/chosen": -35.837615966796875, "logps/rejected": -71.38958740234375, "loss": 0.6294, "rewards/accuracies": 0.0, "rewards/chosen": 3.2411956787109375, "rewards/margins": -0.8828811645507812, "rewards/rejected": 4.124076843261719, "step": 7834 }, { "epoch": 1.73, "learning_rate": 4.562498959439726e-07, "logits/chosen": -1.9852720499038696, "logits/rejected": -1.9881515502929688, "logps/chosen": -105.10144805908203, "logps/rejected": -402.95599365234375, "loss": 0.3758, "rewards/accuracies": 0.0, "rewards/chosen": 13.084558486938477, "rewards/margins": -0.04121112823486328, "rewards/rejected": 13.12576961517334, "step": 7835 }, { "epoch": 1.73, "learning_rate": 4.555021761359435e-07, "logits/chosen": -2.202174663543701, "logits/rejected": -2.1820366382598877, "logps/chosen": -49.76165008544922, "logps/rejected": -79.94320678710938, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": 4.822188854217529, "rewards/margins": 2.0464189052581787, "rewards/rejected": 2.7757699489593506, "step": 7836 }, { "epoch": 1.73, "learning_rate": 4.5475504028495934e-07, "logits/chosen": -1.8444021940231323, "logits/rejected": -1.7669758796691895, "logps/chosen": -53.451534271240234, "logps/rejected": -45.193702697753906, "loss": 0.5733, "rewards/accuracies": 0.0, "rewards/chosen": 3.2409160137176514, "rewards/margins": -0.7523953914642334, "rewards/rejected": 3.9933114051818848, "step": 7837 }, { "epoch": 1.73, "learning_rate": 4.5400848848702663e-07, "logits/chosen": -1.895556926727295, "logits/rejected": -1.767853856086731, "logps/chosen": -63.90044021606445, "logps/rejected": -5.898278713226318, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 2.9593029022216797, "rewards/margins": 1.8998204469680786, "rewards/rejected": 1.059482455253601, "step": 7838 }, { "epoch": 1.74, "learning_rate": 4.532625208380748e-07, "logits/chosen": -2.151139736175537, "logits/rejected": -2.1550722122192383, "logps/chosen": -44.36944580078125, "logps/rejected": -61.170082092285156, "loss": 0.9275, "rewards/accuracies": 1.0, "rewards/chosen": 3.18015456199646, "rewards/margins": 0.20844650268554688, "rewards/rejected": 2.971708059310913, "step": 7839 }, { "epoch": 1.74, "learning_rate": 4.5251713743395877e-07, "logits/chosen": -1.9176065921783447, "logits/rejected": -1.8623530864715576, "logps/chosen": -76.82613372802734, "logps/rejected": -44.73584747314453, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 6.934183597564697, "rewards/margins": 1.6985702514648438, "rewards/rejected": 5.2356133460998535, "step": 7840 }, { "epoch": 1.74, "learning_rate": 4.517723383704592e-07, "logits/chosen": -2.1686391830444336, "logits/rejected": -2.1896305084228516, "logps/chosen": -42.99051284790039, "logps/rejected": -157.96395874023438, "loss": 0.9312, "rewards/accuracies": 0.0, "rewards/chosen": 9.787187576293945, "rewards/margins": -1.3259258270263672, "rewards/rejected": 11.113113403320312, "step": 7841 }, { "epoch": 1.74, "learning_rate": 4.510281237432801e-07, "logits/chosen": -2.000596761703491, "logits/rejected": -1.858634352684021, "logps/chosen": -129.2758331298828, "logps/rejected": -70.22543334960938, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 7.870456218719482, "rewards/margins": 3.743399143218994, "rewards/rejected": 4.127057075500488, "step": 7842 }, { "epoch": 1.74, "learning_rate": 4.502844936480522e-07, "logits/chosen": -1.892037272453308, "logits/rejected": -1.892037272453308, "logps/chosen": -29.34975814819336, "logps/rejected": -29.34975814819336, "loss": 0.3551, "rewards/accuracies": 0.0, "rewards/chosen": 4.807487964630127, "rewards/margins": 0.0, "rewards/rejected": 4.807487964630127, "step": 7843 }, { "epoch": 1.74, "learning_rate": 4.495414481803301e-07, "logits/chosen": -2.1041133403778076, "logits/rejected": -2.1041133403778076, "logps/chosen": -37.090572357177734, "logps/rejected": -37.090572357177734, "loss": 0.3469, "rewards/accuracies": 0.0, "rewards/chosen": 4.785168170928955, "rewards/margins": 0.0, "rewards/rejected": 4.785168170928955, "step": 7844 }, { "epoch": 1.74, "learning_rate": 4.487989874355919e-07, "logits/chosen": -1.7663615942001343, "logits/rejected": -1.7915033102035522, "logps/chosen": -50.65501022338867, "logps/rejected": -66.72492980957031, "loss": 1.2331, "rewards/accuracies": 0.0, "rewards/chosen": 4.649673938751221, "rewards/margins": -2.373350143432617, "rewards/rejected": 7.023024082183838, "step": 7845 }, { "epoch": 1.74, "learning_rate": 4.4805711150924304e-07, "logits/chosen": -2.090245008468628, "logits/rejected": -2.069246768951416, "logps/chosen": -64.32122039794922, "logps/rejected": -68.94384765625, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 7.759392738342285, "rewards/margins": 1.3696436882019043, "rewards/rejected": 6.389749050140381, "step": 7846 }, { "epoch": 1.74, "learning_rate": 4.473158204966133e-07, "logits/chosen": -2.0297069549560547, "logits/rejected": -1.8802086114883423, "logps/chosen": -63.06568908691406, "logps/rejected": -22.181676864624023, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 3.922917127609253, "rewards/margins": 2.5936334133148193, "rewards/rejected": 1.3292837142944336, "step": 7847 }, { "epoch": 1.74, "learning_rate": 4.465751144929559e-07, "logits/chosen": -1.592976689338684, "logits/rejected": -1.594663381576538, "logps/chosen": -10.697224617004395, "logps/rejected": -4.225257873535156, "loss": 1.9476, "rewards/accuracies": 0.0, "rewards/chosen": 0.5541586875915527, "rewards/margins": -0.5058633089065552, "rewards/rejected": 1.060021996498108, "step": 7848 }, { "epoch": 1.74, "learning_rate": 4.4583499359344973e-07, "logits/chosen": -1.9415203332901, "logits/rejected": -1.9053109884262085, "logps/chosen": -72.23738098144531, "logps/rejected": -43.026885986328125, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 6.903382778167725, "rewards/margins": 2.6530513763427734, "rewards/rejected": 4.250331401824951, "step": 7849 }, { "epoch": 1.74, "learning_rate": 4.4509545789319986e-07, "logits/chosen": -1.700784683227539, "logits/rejected": -1.1969232559204102, "logps/chosen": -56.699989318847656, "logps/rejected": -60.70069885253906, "loss": 0.3566, "rewards/accuracies": 0.0, "rewards/chosen": 3.324221134185791, "rewards/margins": -0.035611629486083984, "rewards/rejected": 3.359832763671875, "step": 7850 }, { "epoch": 1.74, "learning_rate": 4.44356507487233e-07, "logits/chosen": -2.2783501148223877, "logits/rejected": -2.273810863494873, "logps/chosen": -62.22260284423828, "logps/rejected": -22.79445457458496, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": 7.632970333099365, "rewards/margins": 7.0866217613220215, "rewards/rejected": 0.5463483929634094, "step": 7851 }, { "epoch": 1.74, "learning_rate": 4.436181424705044e-07, "logits/chosen": -1.9296635389328003, "logits/rejected": -1.8925577402114868, "logps/chosen": -44.76256561279297, "logps/rejected": -68.20460510253906, "loss": 0.5366, "rewards/accuracies": 0.0, "rewards/chosen": 3.9909331798553467, "rewards/margins": -0.6463706493377686, "rewards/rejected": 4.637303829193115, "step": 7852 }, { "epoch": 1.74, "learning_rate": 4.4288036293789083e-07, "logits/chosen": -1.743905782699585, "logits/rejected": -1.7121599912643433, "logps/chosen": -49.189430236816406, "logps/rejected": -65.7784194946289, "loss": 0.4179, "rewards/accuracies": 0.0, "rewards/chosen": 4.419288158416748, "rewards/margins": -0.17945384979248047, "rewards/rejected": 4.5987420082092285, "step": 7853 }, { "epoch": 1.74, "learning_rate": 4.421431689841965e-07, "logits/chosen": -1.694359302520752, "logits/rejected": -1.7504621744155884, "logps/chosen": -82.99334716796875, "logps/rejected": -66.73026275634766, "loss": 0.1739, "rewards/accuracies": 1.0, "rewards/chosen": 5.8116044998168945, "rewards/margins": 0.8898940086364746, "rewards/rejected": 4.92171049118042, "step": 7854 }, { "epoch": 1.74, "learning_rate": 4.414065607041473e-07, "logits/chosen": -2.14504337310791, "logits/rejected": -2.1141514778137207, "logps/chosen": -100.69880676269531, "logps/rejected": -77.30130767822266, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 11.078322410583496, "rewards/margins": 7.359848976135254, "rewards/rejected": 3.718473196029663, "step": 7855 }, { "epoch": 1.74, "learning_rate": 4.406705381923965e-07, "logits/chosen": -2.037583112716675, "logits/rejected": -2.062504768371582, "logps/chosen": -67.25508117675781, "logps/rejected": -111.46918487548828, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 3.686741590499878, "rewards/margins": 1.8931373357772827, "rewards/rejected": 1.7936042547225952, "step": 7856 }, { "epoch": 1.74, "learning_rate": 4.3993510154352106e-07, "logits/chosen": -1.8932901620864868, "logits/rejected": -1.9378480911254883, "logps/chosen": -53.3489990234375, "logps/rejected": -56.3516845703125, "loss": 1.0661, "rewards/accuracies": 0.0, "rewards/chosen": 5.051284313201904, "rewards/margins": -2.0049304962158203, "rewards/rejected": 7.056214809417725, "step": 7857 }, { "epoch": 1.74, "learning_rate": 4.392002508520232e-07, "logits/chosen": -2.1968348026275635, "logits/rejected": -2.1650116443634033, "logps/chosen": -107.34840393066406, "logps/rejected": -69.66800689697266, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": 7.182473659515381, "rewards/margins": 2.537149429321289, "rewards/rejected": 4.645324230194092, "step": 7858 }, { "epoch": 1.74, "learning_rate": 4.3846598621232917e-07, "logits/chosen": -2.0005223751068115, "logits/rejected": -2.0005223751068115, "logps/chosen": -26.30648422241211, "logps/rejected": -26.30648422241211, "loss": 0.4963, "rewards/accuracies": 0.0, "rewards/chosen": 2.2328968048095703, "rewards/margins": 0.0, "rewards/rejected": 2.2328968048095703, "step": 7859 }, { "epoch": 1.74, "learning_rate": 4.3773230771879004e-07, "logits/chosen": -2.1017470359802246, "logits/rejected": -2.0723812580108643, "logps/chosen": -39.82274627685547, "logps/rejected": -63.308265686035156, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": 4.952503204345703, "rewards/margins": 1.949519395828247, "rewards/rejected": 3.002983808517456, "step": 7860 }, { "epoch": 1.74, "learning_rate": 4.369992154656821e-07, "logits/chosen": -1.7547962665557861, "logits/rejected": -1.8140714168548584, "logps/chosen": -58.72023391723633, "logps/rejected": -146.23236083984375, "loss": 0.1781, "rewards/accuracies": 1.0, "rewards/chosen": 7.319899082183838, "rewards/margins": 1.936192512512207, "rewards/rejected": 5.383706569671631, "step": 7861 }, { "epoch": 1.74, "learning_rate": 4.3626670954720616e-07, "logits/chosen": -1.816569209098816, "logits/rejected": -1.8673460483551025, "logps/chosen": -8.76200008392334, "logps/rejected": -28.96098518371582, "loss": 0.3129, "rewards/accuracies": 1.0, "rewards/chosen": 2.3183846473693848, "rewards/margins": 0.15838146209716797, "rewards/rejected": 2.160003185272217, "step": 7862 }, { "epoch": 1.74, "learning_rate": 4.35534790057488e-07, "logits/chosen": -1.8543974161148071, "logits/rejected": -1.7771724462509155, "logps/chosen": -26.535585403442383, "logps/rejected": -11.988089561462402, "loss": 0.1664, "rewards/accuracies": 1.0, "rewards/chosen": 2.209686040878296, "rewards/margins": 0.9360474348068237, "rewards/rejected": 1.2736386060714722, "step": 7863 }, { "epoch": 1.74, "learning_rate": 4.348034570905746e-07, "logits/chosen": -1.967791199684143, "logits/rejected": -1.9326926469802856, "logps/chosen": -96.93907928466797, "logps/rejected": -129.29995727539062, "loss": 0.0824, "rewards/accuracies": 1.0, "rewards/chosen": 8.036423683166504, "rewards/margins": 2.6238489151000977, "rewards/rejected": 5.412574768066406, "step": 7864 }, { "epoch": 1.74, "learning_rate": 4.3407271074044413e-07, "logits/chosen": -1.9962519407272339, "logits/rejected": -1.9695988893508911, "logps/chosen": -29.03546142578125, "logps/rejected": -57.575443267822266, "loss": 0.3072, "rewards/accuracies": 1.0, "rewards/chosen": 3.255389451980591, "rewards/margins": 0.4834582805633545, "rewards/rejected": 2.7719311714172363, "step": 7865 }, { "epoch": 1.74, "learning_rate": 4.333425511009937e-07, "logits/chosen": -1.8302299976348877, "logits/rejected": -1.7309397459030151, "logps/chosen": -62.27316665649414, "logps/rejected": -17.693824768066406, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 3.8610012531280518, "rewards/margins": 3.386482000350952, "rewards/rejected": 0.47451916337013245, "step": 7866 }, { "epoch": 1.74, "learning_rate": 4.326129782660471e-07, "logits/chosen": -2.1528491973876953, "logits/rejected": -2.1840293407440186, "logps/chosen": -121.45101928710938, "logps/rejected": -101.91071319580078, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 9.813774108886719, "rewards/margins": 5.998882293701172, "rewards/rejected": 3.814891815185547, "step": 7867 }, { "epoch": 1.74, "learning_rate": 4.318839923293533e-07, "logits/chosen": -1.7751613855361938, "logits/rejected": -1.6978873014450073, "logps/chosen": -72.08979797363281, "logps/rejected": -34.72504425048828, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 6.93300199508667, "rewards/margins": 4.773622512817383, "rewards/rejected": 2.159379243850708, "step": 7868 }, { "epoch": 1.74, "learning_rate": 4.3115559338458515e-07, "logits/chosen": -1.7329533100128174, "logits/rejected": -1.6532431840896606, "logps/chosen": -43.450462341308594, "logps/rejected": -33.94755554199219, "loss": 0.2036, "rewards/accuracies": 1.0, "rewards/chosen": 5.043843746185303, "rewards/margins": 1.7482802867889404, "rewards/rejected": 3.2955634593963623, "step": 7869 }, { "epoch": 1.74, "learning_rate": 4.30427781525341e-07, "logits/chosen": -1.8618509769439697, "logits/rejected": -1.8448207378387451, "logps/chosen": -80.21388244628906, "logps/rejected": -158.13198852539062, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 10.913756370544434, "rewards/margins": 2.5130767822265625, "rewards/rejected": 8.400679588317871, "step": 7870 }, { "epoch": 1.74, "learning_rate": 4.2970055684514014e-07, "logits/chosen": -1.8430383205413818, "logits/rejected": -1.8333725929260254, "logps/chosen": -50.69755172729492, "logps/rejected": -71.7521743774414, "loss": 0.5226, "rewards/accuracies": 0.0, "rewards/chosen": 3.653543472290039, "rewards/margins": -0.6074328422546387, "rewards/rejected": 4.260976314544678, "step": 7871 }, { "epoch": 1.74, "learning_rate": 4.2897391943743263e-07, "logits/chosen": -1.4439438581466675, "logits/rejected": -1.4439438581466675, "logps/chosen": -3.2392666339874268, "logps/rejected": -3.2392666339874268, "loss": 0.3487, "rewards/accuracies": 0.0, "rewards/chosen": 0.6756641268730164, "rewards/margins": 0.0, "rewards/rejected": 0.6756641268730164, "step": 7872 }, { "epoch": 1.74, "learning_rate": 4.28247869395586e-07, "logits/chosen": -2.0785675048828125, "logits/rejected": -1.9935622215270996, "logps/chosen": -49.7723274230957, "logps/rejected": -7.985370635986328, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 4.620704174041748, "rewards/margins": 3.3361949920654297, "rewards/rejected": 1.2845090627670288, "step": 7873 }, { "epoch": 1.74, "learning_rate": 4.2752240681290027e-07, "logits/chosen": -2.1603033542633057, "logits/rejected": -2.14646053314209, "logps/chosen": -47.06572723388672, "logps/rejected": -67.6727294921875, "loss": 0.1599, "rewards/accuracies": 1.0, "rewards/chosen": 7.013617992401123, "rewards/margins": 1.3546466827392578, "rewards/rejected": 5.658971309661865, "step": 7874 }, { "epoch": 1.74, "learning_rate": 4.267975317825923e-07, "logits/chosen": -2.093336820602417, "logits/rejected": -2.070002317428589, "logps/chosen": -56.20283508300781, "logps/rejected": -60.71910858154297, "loss": 0.6805, "rewards/accuracies": 0.0, "rewards/chosen": 3.9056198596954346, "rewards/margins": -1.0296051502227783, "rewards/rejected": 4.935225009918213, "step": 7875 }, { "epoch": 1.74, "learning_rate": 4.260732443978077e-07, "logits/chosen": -1.8629100322723389, "logits/rejected": -1.8130720853805542, "logps/chosen": -45.250240325927734, "logps/rejected": -21.40792465209961, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": 4.209367752075195, "rewards/margins": 1.008915662765503, "rewards/rejected": 3.2004520893096924, "step": 7876 }, { "epoch": 1.74, "learning_rate": 4.253495447516159e-07, "logits/chosen": -1.80349862575531, "logits/rejected": -1.646446943283081, "logps/chosen": -95.67253112792969, "logps/rejected": -45.67232894897461, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 3.037275791168213, "rewards/margins": 2.8134472370147705, "rewards/rejected": 0.2238285094499588, "step": 7877 }, { "epoch": 1.74, "learning_rate": 4.2462643293701033e-07, "logits/chosen": -1.8442490100860596, "logits/rejected": -1.8159581422805786, "logps/chosen": -34.17927551269531, "logps/rejected": -66.79109191894531, "loss": 0.1457, "rewards/accuracies": 1.0, "rewards/chosen": 3.4336166381835938, "rewards/margins": 1.1988906860351562, "rewards/rejected": 2.2347259521484375, "step": 7878 }, { "epoch": 1.74, "learning_rate": 4.2390390904691045e-07, "logits/chosen": -2.309817314147949, "logits/rejected": -2.304532051086426, "logps/chosen": -42.12497329711914, "logps/rejected": -76.97709655761719, "loss": 1.2084, "rewards/accuracies": 0.0, "rewards/chosen": 5.878634452819824, "rewards/margins": -2.2057485580444336, "rewards/rejected": 8.084383010864258, "step": 7879 }, { "epoch": 1.74, "learning_rate": 4.2318197317415545e-07, "logits/chosen": -1.832761287689209, "logits/rejected": -1.800834059715271, "logps/chosen": -34.201683044433594, "logps/rejected": -53.05250549316406, "loss": 1.3124, "rewards/accuracies": 0.0, "rewards/chosen": 2.1267545223236084, "rewards/margins": -2.535496950149536, "rewards/rejected": 4.6622514724731445, "step": 7880 }, { "epoch": 1.74, "learning_rate": 4.2246062541151653e-07, "logits/chosen": -2.3443126678466797, "logits/rejected": -2.3530919551849365, "logps/chosen": -67.40988159179688, "logps/rejected": -99.32698059082031, "loss": 0.1388, "rewards/accuracies": 1.0, "rewards/chosen": 7.831398010253906, "rewards/margins": 1.1431059837341309, "rewards/rejected": 6.688292026519775, "step": 7881 }, { "epoch": 1.74, "learning_rate": 4.2173986585168136e-07, "logits/chosen": -2.016879081726074, "logits/rejected": -1.9212418794631958, "logps/chosen": -64.33615112304688, "logps/rejected": -13.532011985778809, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": 4.27798318862915, "rewards/margins": 3.3835690021514893, "rewards/rejected": 0.8944142460823059, "step": 7882 }, { "epoch": 1.74, "learning_rate": 4.210196945872691e-07, "logits/chosen": -1.7134928703308105, "logits/rejected": -1.498015284538269, "logps/chosen": -88.1277084350586, "logps/rejected": -19.11156463623047, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 4.977180004119873, "rewards/margins": 3.5867738723754883, "rewards/rejected": 1.3904060125350952, "step": 7883 }, { "epoch": 1.75, "learning_rate": 4.2030011171081687e-07, "logits/chosen": -2.05069899559021, "logits/rejected": -1.998655080795288, "logps/chosen": -91.10609436035156, "logps/rejected": -72.85054016113281, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 7.0808610916137695, "rewards/margins": 4.291532516479492, "rewards/rejected": 2.7893288135528564, "step": 7884 }, { "epoch": 1.75, "learning_rate": 4.195811173147929e-07, "logits/chosen": -2.0253188610076904, "logits/rejected": -2.0029408931732178, "logps/chosen": -47.63761901855469, "logps/rejected": -50.31330490112305, "loss": 0.229, "rewards/accuracies": 1.0, "rewards/chosen": 4.339601039886475, "rewards/margins": 1.0044763088226318, "rewards/rejected": 3.3351247310638428, "step": 7885 }, { "epoch": 1.75, "learning_rate": 4.188627114915833e-07, "logits/chosen": -2.0357398986816406, "logits/rejected": -2.021912097930908, "logps/chosen": -46.20513153076172, "logps/rejected": -56.27491760253906, "loss": 0.3307, "rewards/accuracies": 1.0, "rewards/chosen": 3.1651360988616943, "rewards/margins": 0.18719482421875, "rewards/rejected": 2.9779412746429443, "step": 7886 }, { "epoch": 1.75, "learning_rate": 4.181448943335026e-07, "logits/chosen": -2.0742781162261963, "logits/rejected": -1.9943236112594604, "logps/chosen": -49.61293029785156, "logps/rejected": -17.11041259765625, "loss": 0.2123, "rewards/accuracies": 1.0, "rewards/chosen": 3.7798264026641846, "rewards/margins": 3.1383183002471924, "rewards/rejected": 0.6415081024169922, "step": 7887 }, { "epoch": 1.75, "learning_rate": 4.1742766593278974e-07, "logits/chosen": -1.957425832748413, "logits/rejected": -1.7888909578323364, "logps/chosen": -104.84835815429688, "logps/rejected": -40.46733474731445, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 6.116720676422119, "rewards/margins": 5.637393474578857, "rewards/rejected": 0.4793270230293274, "step": 7888 }, { "epoch": 1.75, "learning_rate": 4.167110263816032e-07, "logits/chosen": -2.058971405029297, "logits/rejected": -2.086883544921875, "logps/chosen": -50.25229263305664, "logps/rejected": -39.03694152832031, "loss": 0.6075, "rewards/accuracies": 0.0, "rewards/chosen": 0.9901108145713806, "rewards/margins": -0.77744060754776, "rewards/rejected": 1.7675514221191406, "step": 7889 }, { "epoch": 1.75, "learning_rate": 4.1599497577203385e-07, "logits/chosen": -2.0540621280670166, "logits/rejected": -2.1074776649475098, "logps/chosen": -45.01740264892578, "logps/rejected": -126.62321472167969, "loss": 1.4613, "rewards/accuracies": 0.0, "rewards/chosen": 8.053818702697754, "rewards/margins": -2.8668432235717773, "rewards/rejected": 10.920661926269531, "step": 7890 }, { "epoch": 1.75, "learning_rate": 4.152795141960886e-07, "logits/chosen": -1.7165613174438477, "logits/rejected": -1.7224822044372559, "logps/chosen": -8.518660545349121, "logps/rejected": -3.437825918197632, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 2.9788613319396973, "rewards/margins": 2.378133773803711, "rewards/rejected": 0.6007276773452759, "step": 7891 }, { "epoch": 1.75, "learning_rate": 4.1456464174570666e-07, "logits/chosen": -2.2086586952209473, "logits/rejected": -2.2026171684265137, "logps/chosen": -55.600547790527344, "logps/rejected": -88.17879486083984, "loss": 1.4724, "rewards/accuracies": 0.0, "rewards/chosen": 4.083324432373047, "rewards/margins": -2.866856575012207, "rewards/rejected": 6.950181007385254, "step": 7892 }, { "epoch": 1.75, "learning_rate": 4.1385035851274403e-07, "logits/chosen": -2.1879265308380127, "logits/rejected": -2.1677467823028564, "logps/chosen": -51.29448699951172, "logps/rejected": -94.42739868164062, "loss": 0.3053, "rewards/accuracies": 1.0, "rewards/chosen": 5.6449174880981445, "rewards/margins": 0.18234586715698242, "rewards/rejected": 5.462571620941162, "step": 7893 }, { "epoch": 1.75, "learning_rate": 4.131366645889856e-07, "logits/chosen": -1.7559728622436523, "logits/rejected": -1.2766692638397217, "logps/chosen": -87.48433685302734, "logps/rejected": -49.349571228027344, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 6.4651618003845215, "rewards/margins": 1.904935359954834, "rewards/rejected": 4.5602264404296875, "step": 7894 }, { "epoch": 1.75, "learning_rate": 4.124235600661386e-07, "logits/chosen": -2.063559055328369, "logits/rejected": -2.0240557193756104, "logps/chosen": -58.09086608886719, "logps/rejected": -52.36827087402344, "loss": 0.2735, "rewards/accuracies": 1.0, "rewards/chosen": 2.7775604724884033, "rewards/margins": 0.5230681896209717, "rewards/rejected": 2.2544922828674316, "step": 7895 }, { "epoch": 1.75, "learning_rate": 4.117110450358369e-07, "logits/chosen": -2.0196938514709473, "logits/rejected": -1.9840635061264038, "logps/chosen": -51.23367691040039, "logps/rejected": -63.387733459472656, "loss": 0.2524, "rewards/accuracies": 1.0, "rewards/chosen": 4.442539691925049, "rewards/margins": 0.498335599899292, "rewards/rejected": 3.944204092025757, "step": 7896 }, { "epoch": 1.75, "learning_rate": 4.1099911958963625e-07, "logits/chosen": -1.882044792175293, "logits/rejected": -1.868431806564331, "logps/chosen": -96.36068725585938, "logps/rejected": -83.3583984375, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 5.882838726043701, "rewards/margins": 2.5915400981903076, "rewards/rejected": 3.2912986278533936, "step": 7897 }, { "epoch": 1.75, "learning_rate": 4.102877838190156e-07, "logits/chosen": -1.974162220954895, "logits/rejected": -1.959725260734558, "logps/chosen": -68.99359130859375, "logps/rejected": -78.99488830566406, "loss": 0.1496, "rewards/accuracies": 1.0, "rewards/chosen": 5.478331089019775, "rewards/margins": 1.0939888954162598, "rewards/rejected": 4.384342193603516, "step": 7898 }, { "epoch": 1.75, "learning_rate": 4.095770378153829e-07, "logits/chosen": -1.856832504272461, "logits/rejected": -1.8291959762573242, "logps/chosen": -46.164913177490234, "logps/rejected": -76.32475280761719, "loss": 2.2809, "rewards/accuracies": 0.0, "rewards/chosen": 3.297652006149292, "rewards/margins": -0.283189058303833, "rewards/rejected": 3.580841064453125, "step": 7899 }, { "epoch": 1.75, "learning_rate": 4.0886688167006415e-07, "logits/chosen": -1.8614041805267334, "logits/rejected": -1.8404700756072998, "logps/chosen": -26.365629196166992, "logps/rejected": -27.66719627380371, "loss": 0.3042, "rewards/accuracies": 1.0, "rewards/chosen": 1.924127459526062, "rewards/margins": 0.41502153873443604, "rewards/rejected": 1.509105920791626, "step": 7900 }, { "epoch": 1.75, "learning_rate": 4.081573154743157e-07, "logits/chosen": -1.9383823871612549, "logits/rejected": -1.9234950542449951, "logps/chosen": -55.373992919921875, "logps/rejected": -90.74281311035156, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/chosen": 2.994593858718872, "rewards/margins": 1.3004226684570312, "rewards/rejected": 1.6941711902618408, "step": 7901 }, { "epoch": 1.75, "learning_rate": 4.074483393193135e-07, "logits/chosen": -2.23069429397583, "logits/rejected": -2.194432020187378, "logps/chosen": -92.197021484375, "logps/rejected": -36.309303283691406, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 7.521939277648926, "rewards/margins": 2.6897330284118652, "rewards/rejected": 4.8322062492370605, "step": 7902 }, { "epoch": 1.75, "learning_rate": 4.0673995329615913e-07, "logits/chosen": -1.9256385564804077, "logits/rejected": -1.8963371515274048, "logps/chosen": -133.59487915039062, "logps/rejected": -96.55718994140625, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 10.02762508392334, "rewards/margins": 3.4476475715637207, "rewards/rejected": 6.579977512359619, "step": 7903 }, { "epoch": 1.75, "learning_rate": 4.0603215749587864e-07, "logits/chosen": -1.7544665336608887, "logits/rejected": -1.7525376081466675, "logps/chosen": -37.391136169433594, "logps/rejected": -66.70664978027344, "loss": 0.1243, "rewards/accuracies": 1.0, "rewards/chosen": 4.337314128875732, "rewards/margins": 1.27125883102417, "rewards/rejected": 3.0660552978515625, "step": 7904 }, { "epoch": 1.75, "learning_rate": 4.0532495200942266e-07, "logits/chosen": -2.1135847568511963, "logits/rejected": -2.0805280208587646, "logps/chosen": -42.26665496826172, "logps/rejected": -70.02328491210938, "loss": 1.1325, "rewards/accuracies": 1.0, "rewards/chosen": 4.942513942718506, "rewards/margins": 1.0231611728668213, "rewards/rejected": 3.9193527698516846, "step": 7905 }, { "epoch": 1.75, "learning_rate": 4.046183369276657e-07, "logits/chosen": -2.1876895427703857, "logits/rejected": -2.139493942260742, "logps/chosen": -84.06292724609375, "logps/rejected": -82.65237426757812, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 9.337717056274414, "rewards/margins": 5.162424087524414, "rewards/rejected": 4.17529296875, "step": 7906 }, { "epoch": 1.75, "learning_rate": 4.0391231234140403e-07, "logits/chosen": -1.9294556379318237, "logits/rejected": -1.9460192918777466, "logps/chosen": -45.722129821777344, "logps/rejected": -78.47982788085938, "loss": 0.2908, "rewards/accuracies": 1.0, "rewards/chosen": 5.366976261138916, "rewards/margins": 0.7071418762207031, "rewards/rejected": 4.659834384918213, "step": 7907 }, { "epoch": 1.75, "learning_rate": 4.032068783413634e-07, "logits/chosen": -1.6651420593261719, "logits/rejected": -1.7105209827423096, "logps/chosen": -45.67536163330078, "logps/rejected": -52.734718322753906, "loss": 0.3366, "rewards/accuracies": 1.0, "rewards/chosen": 4.819455623626709, "rewards/margins": 0.1336355209350586, "rewards/rejected": 4.68582010269165, "step": 7908 }, { "epoch": 1.75, "learning_rate": 4.025020350181863e-07, "logits/chosen": -1.714015007019043, "logits/rejected": -1.574668049812317, "logps/chosen": -78.32998657226562, "logps/rejected": -32.78203201293945, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 3.2370057106018066, "rewards/margins": 2.155304431915283, "rewards/rejected": 1.0817012786865234, "step": 7909 }, { "epoch": 1.75, "learning_rate": 4.017977824624475e-07, "logits/chosen": -1.5445595979690552, "logits/rejected": -1.6010481119155884, "logps/chosen": -56.49962615966797, "logps/rejected": -73.7588119506836, "loss": 2.4081, "rewards/accuracies": 0.0, "rewards/chosen": 3.1958374977111816, "rewards/margins": -4.8003973960876465, "rewards/rejected": 7.996234893798828, "step": 7910 }, { "epoch": 1.75, "learning_rate": 4.0109412076463905e-07, "logits/chosen": -1.948918104171753, "logits/rejected": -1.7100962400436401, "logps/chosen": -104.64664459228516, "logps/rejected": -65.74594116210938, "loss": 0.131, "rewards/accuracies": 1.0, "rewards/chosen": 5.020487308502197, "rewards/margins": 2.7736122608184814, "rewards/rejected": 2.246875047683716, "step": 7911 }, { "epoch": 1.75, "learning_rate": 4.0039105001518084e-07, "logits/chosen": -2.111785650253296, "logits/rejected": -2.134347677230835, "logps/chosen": -24.956310272216797, "logps/rejected": -75.84880065917969, "loss": 1.6297, "rewards/accuracies": 1.0, "rewards/chosen": 3.1009819507598877, "rewards/margins": 1.45037841796875, "rewards/rejected": 1.6506035327911377, "step": 7912 }, { "epoch": 1.75, "learning_rate": 3.996885703044151e-07, "logits/chosen": -1.6887441873550415, "logits/rejected": -1.1171239614486694, "logps/chosen": -21.385364532470703, "logps/rejected": -98.2552719116211, "loss": 1.776, "rewards/accuracies": 0.0, "rewards/chosen": 2.06701397895813, "rewards/margins": -3.3175160884857178, "rewards/rejected": 5.384530067443848, "step": 7913 }, { "epoch": 1.75, "learning_rate": 3.9898668172260956e-07, "logits/chosen": -1.8696725368499756, "logits/rejected": -1.771944284439087, "logps/chosen": -114.86968231201172, "logps/rejected": -93.48387145996094, "loss": 0.3539, "rewards/accuracies": 1.0, "rewards/chosen": 5.883799076080322, "rewards/margins": 4.214334964752197, "rewards/rejected": 1.669464111328125, "step": 7914 }, { "epoch": 1.75, "learning_rate": 3.982853843599549e-07, "logits/chosen": -1.7655225992202759, "logits/rejected": -1.7164807319641113, "logps/chosen": -30.787818908691406, "logps/rejected": -62.20112991333008, "loss": 0.1887, "rewards/accuracies": 1.0, "rewards/chosen": 3.5545761585235596, "rewards/margins": 0.9741039276123047, "rewards/rejected": 2.580472230911255, "step": 7915 }, { "epoch": 1.75, "learning_rate": 3.9758467830656623e-07, "logits/chosen": -1.804701566696167, "logits/rejected": -1.8636058568954468, "logps/chosen": -36.774169921875, "logps/rejected": -83.85993194580078, "loss": 1.3849, "rewards/accuracies": 0.0, "rewards/chosen": 4.100712776184082, "rewards/margins": -2.704803943634033, "rewards/rejected": 6.805516719818115, "step": 7916 }, { "epoch": 1.75, "learning_rate": 3.9688456365248316e-07, "logits/chosen": -2.0367379188537598, "logits/rejected": -2.030580997467041, "logps/chosen": -62.90779495239258, "logps/rejected": -84.46708679199219, "loss": 1.087, "rewards/accuracies": 1.0, "rewards/chosen": 3.9486196041107178, "rewards/margins": 1.366927146911621, "rewards/rejected": 2.5816924571990967, "step": 7917 }, { "epoch": 1.75, "learning_rate": 3.9618504048766605e-07, "logits/chosen": -1.7374541759490967, "logits/rejected": -1.7851300239562988, "logps/chosen": -58.99535369873047, "logps/rejected": -142.92611694335938, "loss": 0.4104, "rewards/accuracies": 0.0, "rewards/chosen": 8.006434440612793, "rewards/margins": -0.13871383666992188, "rewards/rejected": 8.145148277282715, "step": 7918 }, { "epoch": 1.75, "learning_rate": 3.9548610890200625e-07, "logits/chosen": -1.6549359560012817, "logits/rejected": -1.6305071115493774, "logps/chosen": -34.91355895996094, "logps/rejected": -43.63892364501953, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 2.900106191635132, "rewards/margins": 1.7386746406555176, "rewards/rejected": 1.1614315509796143, "step": 7919 }, { "epoch": 1.75, "learning_rate": 3.9478776898531135e-07, "logits/chosen": -1.9996551275253296, "logits/rejected": -1.9832534790039062, "logps/chosen": -55.38777542114258, "logps/rejected": -82.31173706054688, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": 3.6423518657684326, "rewards/margins": 0.9444377422332764, "rewards/rejected": 2.6979141235351562, "step": 7920 }, { "epoch": 1.75, "learning_rate": 3.940900208273174e-07, "logits/chosen": -1.8854349851608276, "logits/rejected": -1.8567488193511963, "logps/chosen": -81.42194366455078, "logps/rejected": -40.60957336425781, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 8.473816871643066, "rewards/margins": 4.571066856384277, "rewards/rejected": 3.90274977684021, "step": 7921 }, { "epoch": 1.75, "learning_rate": 3.933928645176838e-07, "logits/chosen": -1.9634374380111694, "logits/rejected": -1.8642586469650269, "logps/chosen": -78.1982421875, "logps/rejected": -51.18363571166992, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 6.145022869110107, "rewards/margins": 3.2202658653259277, "rewards/rejected": 2.9247570037841797, "step": 7922 }, { "epoch": 1.75, "learning_rate": 3.926963001459927e-07, "logits/chosen": -1.8563722372055054, "logits/rejected": -1.849035620689392, "logps/chosen": -76.01377868652344, "logps/rejected": -79.26375579833984, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 6.84079122543335, "rewards/margins": 4.289444923400879, "rewards/rejected": 2.55134654045105, "step": 7923 }, { "epoch": 1.75, "learning_rate": 3.9200032780175203e-07, "logits/chosen": -1.663757085800171, "logits/rejected": -1.6499857902526855, "logps/chosen": -74.57512664794922, "logps/rejected": -111.60054016113281, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 2.8334038257598877, "rewards/margins": 2.135988712310791, "rewards/rejected": 0.6974151730537415, "step": 7924 }, { "epoch": 1.75, "learning_rate": 3.913049475743913e-07, "logits/chosen": -1.990136981010437, "logits/rejected": -2.082772731781006, "logps/chosen": -38.696083068847656, "logps/rejected": -153.37289428710938, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": 8.352202415466309, "rewards/margins": 1.4308843612670898, "rewards/rejected": 6.921318054199219, "step": 7925 }, { "epoch": 1.75, "learning_rate": 3.9061015955326686e-07, "logits/chosen": -2.089200735092163, "logits/rejected": -1.956230878829956, "logps/chosen": -189.01364135742188, "logps/rejected": -50.574459075927734, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 8.26273250579834, "rewards/margins": 3.1149725914001465, "rewards/rejected": 5.147759914398193, "step": 7926 }, { "epoch": 1.75, "learning_rate": 3.8991596382765453e-07, "logits/chosen": -1.8405512571334839, "logits/rejected": -1.8147159814834595, "logps/chosen": -62.91132736206055, "logps/rejected": -56.90559387207031, "loss": 0.5985, "rewards/accuracies": 1.0, "rewards/chosen": 3.5313327312469482, "rewards/margins": 1.5195116996765137, "rewards/rejected": 2.0118210315704346, "step": 7927 }, { "epoch": 1.75, "learning_rate": 3.8922236048676064e-07, "logits/chosen": -1.7770259380340576, "logits/rejected": -1.7770259380340576, "logps/chosen": -31.944046020507812, "logps/rejected": -31.944046020507812, "loss": 0.4407, "rewards/accuracies": 0.0, "rewards/chosen": 1.5498383045196533, "rewards/margins": 0.0, "rewards/rejected": 1.5498383045196533, "step": 7928 }, { "epoch": 1.75, "learning_rate": 3.885293496197079e-07, "logits/chosen": -1.699543833732605, "logits/rejected": -1.716039776802063, "logps/chosen": -17.98102569580078, "logps/rejected": -66.34202575683594, "loss": 3.166, "rewards/accuracies": 0.0, "rewards/chosen": 2.755873203277588, "rewards/margins": -3.3017578125, "rewards/rejected": 6.057631015777588, "step": 7929 }, { "epoch": 1.76, "learning_rate": 3.8783693131554836e-07, "logits/chosen": -1.8924849033355713, "logits/rejected": -1.6889172792434692, "logps/chosen": -191.47817993164062, "logps/rejected": -23.209604263305664, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 7.010293483734131, "rewards/margins": 6.8804030418396, "rewards/rejected": 0.12989024817943573, "step": 7930 }, { "epoch": 1.76, "learning_rate": 3.871451056632558e-07, "logits/chosen": -2.0639843940734863, "logits/rejected": -2.0705933570861816, "logps/chosen": -21.943452835083008, "logps/rejected": -62.54188919067383, "loss": 0.6147, "rewards/accuracies": 0.0, "rewards/chosen": 3.864447832107544, "rewards/margins": -0.006743192672729492, "rewards/rejected": 3.8711910247802734, "step": 7931 }, { "epoch": 1.76, "learning_rate": 3.8645387275172807e-07, "logits/chosen": -1.8294429779052734, "logits/rejected": -1.813873529434204, "logps/chosen": -52.59494400024414, "logps/rejected": -63.3875732421875, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 5.974164962768555, "rewards/margins": 2.2806499004364014, "rewards/rejected": 3.6935150623321533, "step": 7932 }, { "epoch": 1.76, "learning_rate": 3.8576323266978744e-07, "logits/chosen": -1.7603368759155273, "logits/rejected": -1.8691498041152954, "logps/chosen": -26.597332000732422, "logps/rejected": -135.9984130859375, "loss": 0.5795, "rewards/accuracies": 0.0, "rewards/chosen": 4.502692699432373, "rewards/margins": -0.7743840217590332, "rewards/rejected": 5.277076721191406, "step": 7933 }, { "epoch": 1.76, "learning_rate": 3.85073185506179e-07, "logits/chosen": -1.7609455585479736, "logits/rejected": -1.707479476928711, "logps/chosen": -37.865779876708984, "logps/rejected": -47.52650451660156, "loss": 0.2334, "rewards/accuracies": 1.0, "rewards/chosen": 4.694050312042236, "rewards/margins": 0.5576305389404297, "rewards/rejected": 4.136419773101807, "step": 7934 }, { "epoch": 1.76, "learning_rate": 3.8438373134957186e-07, "logits/chosen": -1.8090189695358276, "logits/rejected": -1.8090189695358276, "logps/chosen": -22.050514221191406, "logps/rejected": -22.050514221191406, "loss": 0.4336, "rewards/accuracies": 0.0, "rewards/chosen": 4.034176826477051, "rewards/margins": 0.0, "rewards/rejected": 4.034176826477051, "step": 7935 }, { "epoch": 1.76, "learning_rate": 3.8369487028855956e-07, "logits/chosen": -1.734031081199646, "logits/rejected": -1.7974902391433716, "logps/chosen": -47.549190521240234, "logps/rejected": -117.4771957397461, "loss": 0.2717, "rewards/accuracies": 1.0, "rewards/chosen": 8.411493301391602, "rewards/margins": 0.466646671295166, "rewards/rejected": 7.9448466300964355, "step": 7936 }, { "epoch": 1.76, "learning_rate": 3.8300660241166023e-07, "logits/chosen": -2.1244046688079834, "logits/rejected": -2.1041133403778076, "logps/chosen": -27.34616470336914, "logps/rejected": -59.689483642578125, "loss": 0.7431, "rewards/accuracies": 0.0, "rewards/chosen": 1.8530758619308472, "rewards/margins": -0.7088366746902466, "rewards/rejected": 2.5619125366210938, "step": 7937 }, { "epoch": 1.76, "learning_rate": 3.8231892780731305e-07, "logits/chosen": -1.7770133018493652, "logits/rejected": -1.8232614994049072, "logps/chosen": -49.709251403808594, "logps/rejected": -123.9137954711914, "loss": 0.8988, "rewards/accuracies": 0.0, "rewards/chosen": 5.126473426818848, "rewards/margins": -1.5438690185546875, "rewards/rejected": 6.670342445373535, "step": 7938 }, { "epoch": 1.76, "learning_rate": 3.816318465638824e-07, "logits/chosen": -1.794924259185791, "logits/rejected": -1.8246597051620483, "logps/chosen": -20.887859344482422, "logps/rejected": -43.73615264892578, "loss": 0.5771, "rewards/accuracies": 0.0, "rewards/chosen": 2.959291458129883, "rewards/margins": -0.6885838508605957, "rewards/rejected": 3.6478753089904785, "step": 7939 }, { "epoch": 1.76, "learning_rate": 3.809453587696577e-07, "logits/chosen": -2.0258238315582275, "logits/rejected": -2.046938896179199, "logps/chosen": -85.7983169555664, "logps/rejected": -140.60179138183594, "loss": 0.3088, "rewards/accuracies": 1.0, "rewards/chosen": 8.532195091247559, "rewards/margins": 0.5086870193481445, "rewards/rejected": 8.023508071899414, "step": 7940 }, { "epoch": 1.76, "learning_rate": 3.8025946451285045e-07, "logits/chosen": -1.8702573776245117, "logits/rejected": -1.8046765327453613, "logps/chosen": -34.055763244628906, "logps/rejected": -29.138376235961914, "loss": 0.2625, "rewards/accuracies": 1.0, "rewards/chosen": 2.589299440383911, "rewards/margins": 0.8606947660446167, "rewards/rejected": 1.7286046743392944, "step": 7941 }, { "epoch": 1.76, "learning_rate": 3.795741638815958e-07, "logits/chosen": -2.3038058280944824, "logits/rejected": -2.313523054122925, "logps/chosen": -80.6968994140625, "logps/rejected": -112.57195281982422, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 6.19793701171875, "rewards/margins": 4.453461647033691, "rewards/rejected": 1.7444756031036377, "step": 7942 }, { "epoch": 1.76, "learning_rate": 3.7888945696395453e-07, "logits/chosen": -2.0356976985931396, "logits/rejected": -1.9340956211090088, "logps/chosen": -113.29623413085938, "logps/rejected": -77.71925354003906, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 6.096412658691406, "rewards/margins": 3.216230630874634, "rewards/rejected": 2.8801820278167725, "step": 7943 }, { "epoch": 1.76, "learning_rate": 3.782053438479094e-07, "logits/chosen": -2.3772194385528564, "logits/rejected": -2.324399948120117, "logps/chosen": -57.63676071166992, "logps/rejected": -25.704471588134766, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": 3.2061169147491455, "rewards/margins": 1.8408088684082031, "rewards/rejected": 1.3653080463409424, "step": 7944 }, { "epoch": 1.76, "learning_rate": 3.7752182462136744e-07, "logits/chosen": -1.9306341409683228, "logits/rejected": -1.9304906129837036, "logps/chosen": -48.079917907714844, "logps/rejected": -91.69637298583984, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 4.409072399139404, "rewards/margins": 1.666762113571167, "rewards/rejected": 2.7423102855682373, "step": 7945 }, { "epoch": 1.76, "learning_rate": 3.7683889937215833e-07, "logits/chosen": -1.53764009475708, "logits/rejected": -1.6056628227233887, "logps/chosen": -11.319235801696777, "logps/rejected": -73.33464050292969, "loss": 1.7796, "rewards/accuracies": 0.0, "rewards/chosen": 2.3248956203460693, "rewards/margins": -3.3976857662200928, "rewards/rejected": 5.722581386566162, "step": 7946 }, { "epoch": 1.76, "learning_rate": 3.761565681880369e-07, "logits/chosen": -1.7172685861587524, "logits/rejected": -1.7120189666748047, "logps/chosen": -51.57048797607422, "logps/rejected": -35.729034423828125, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 4.467400550842285, "rewards/margins": 0.9193201065063477, "rewards/rejected": 3.5480804443359375, "step": 7947 }, { "epoch": 1.76, "learning_rate": 3.7547483115668083e-07, "logits/chosen": -1.9888801574707031, "logits/rejected": -1.8680155277252197, "logps/chosen": -165.51947021484375, "logps/rejected": -71.67593383789062, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 6.258917331695557, "rewards/margins": 3.3566207885742188, "rewards/rejected": 2.902296543121338, "step": 7948 }, { "epoch": 1.76, "learning_rate": 3.747936883656916e-07, "logits/chosen": -1.6878767013549805, "logits/rejected": -1.7471063137054443, "logps/chosen": -31.95523452758789, "logps/rejected": -47.298828125, "loss": 0.4194, "rewards/accuracies": 0.0, "rewards/chosen": 3.468230962753296, "rewards/margins": -0.257824182510376, "rewards/rejected": 3.726055145263672, "step": 7949 }, { "epoch": 1.76, "learning_rate": 3.7411313990259546e-07, "logits/chosen": -2.224078893661499, "logits/rejected": -2.228114604949951, "logps/chosen": -48.78318786621094, "logps/rejected": -129.44793701171875, "loss": 1.9788, "rewards/accuracies": 0.0, "rewards/chosen": 3.6467971801757812, "rewards/margins": -3.931666851043701, "rewards/rejected": 7.578464031219482, "step": 7950 }, { "epoch": 1.76, "learning_rate": 3.7343318585484014e-07, "logits/chosen": -1.9617232084274292, "logits/rejected": -1.838158130645752, "logps/chosen": -108.66278839111328, "logps/rejected": -29.867918014526367, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.712657928466797, "rewards/margins": 5.926769256591797, "rewards/rejected": 0.7858888506889343, "step": 7951 }, { "epoch": 1.76, "learning_rate": 3.727538263097985e-07, "logits/chosen": -2.119283676147461, "logits/rejected": -1.8047113418579102, "logps/chosen": -41.03436279296875, "logps/rejected": -213.75689697265625, "loss": 1.9099, "rewards/accuracies": 0.0, "rewards/chosen": 6.230456829071045, "rewards/margins": -3.6731600761413574, "rewards/rejected": 9.903616905212402, "step": 7952 }, { "epoch": 1.76, "learning_rate": 3.7207506135476637e-07, "logits/chosen": -1.9446210861206055, "logits/rejected": -1.914489507675171, "logps/chosen": -48.07880401611328, "logps/rejected": -69.22989654541016, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 3.2539894580841064, "rewards/margins": 1.1969552040100098, "rewards/rejected": 2.0570342540740967, "step": 7953 }, { "epoch": 1.76, "learning_rate": 3.7139689107696496e-07, "logits/chosen": -2.016547918319702, "logits/rejected": -2.0049614906311035, "logps/chosen": -40.46513366699219, "logps/rejected": -72.36026000976562, "loss": 0.0884, "rewards/accuracies": 1.0, "rewards/chosen": 5.4837846755981445, "rewards/margins": 1.8133256435394287, "rewards/rejected": 3.670459032058716, "step": 7954 }, { "epoch": 1.76, "learning_rate": 3.707193155635341e-07, "logits/chosen": -1.6809061765670776, "logits/rejected": -1.5218502283096313, "logps/chosen": -129.89077758789062, "logps/rejected": -41.400081634521484, "loss": 0.1365, "rewards/accuracies": 1.0, "rewards/chosen": 4.490713596343994, "rewards/margins": 4.311551570892334, "rewards/rejected": 0.17916183173656464, "step": 7955 }, { "epoch": 1.76, "learning_rate": 3.7004233490154475e-07, "logits/chosen": -1.6122684478759766, "logits/rejected": -1.5467092990875244, "logps/chosen": -26.219833374023438, "logps/rejected": -48.18157196044922, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 2.540834426879883, "rewards/margins": 1.0712729692459106, "rewards/rejected": 1.4695614576339722, "step": 7956 }, { "epoch": 1.76, "learning_rate": 3.6936594917798386e-07, "logits/chosen": -1.7267571687698364, "logits/rejected": -1.7421984672546387, "logps/chosen": -18.346721649169922, "logps/rejected": -69.00546264648438, "loss": 0.8972, "rewards/accuracies": 0.0, "rewards/chosen": 4.456653118133545, "rewards/margins": -1.5606565475463867, "rewards/rejected": 6.017309665679932, "step": 7957 }, { "epoch": 1.76, "learning_rate": 3.686901584797675e-07, "logits/chosen": -2.0007174015045166, "logits/rejected": -1.9998819828033447, "logps/chosen": -28.116958618164062, "logps/rejected": -94.88432312011719, "loss": 0.1708, "rewards/accuracies": 1.0, "rewards/chosen": 3.75895094871521, "rewards/margins": 0.9888114929199219, "rewards/rejected": 2.770139455795288, "step": 7958 }, { "epoch": 1.76, "learning_rate": 3.6801496289373186e-07, "logits/chosen": -1.9285573959350586, "logits/rejected": -1.917262315750122, "logps/chosen": -14.87446403503418, "logps/rejected": -37.162933349609375, "loss": 0.7917, "rewards/accuracies": 0.0, "rewards/chosen": 3.225175619125366, "rewards/margins": -0.5927984714508057, "rewards/rejected": 3.817974090576172, "step": 7959 }, { "epoch": 1.76, "learning_rate": 3.673403625066385e-07, "logits/chosen": -1.718908667564392, "logits/rejected": -1.718908667564392, "logps/chosen": -69.4760513305664, "logps/rejected": -69.4760513305664, "loss": 0.3466, "rewards/accuracies": 0.0, "rewards/chosen": 4.126992225646973, "rewards/margins": 0.0, "rewards/rejected": 4.126992225646973, "step": 7960 }, { "epoch": 1.76, "learning_rate": 3.666663574051721e-07, "logits/chosen": -1.8298838138580322, "logits/rejected": -1.8303316831588745, "logps/chosen": -35.912498474121094, "logps/rejected": -37.81284713745117, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 4.291558265686035, "rewards/margins": 0.39186906814575195, "rewards/rejected": 3.899689197540283, "step": 7961 }, { "epoch": 1.76, "learning_rate": 3.659929476759405e-07, "logits/chosen": -2.074117422103882, "logits/rejected": -2.1061084270477295, "logps/chosen": -77.71221160888672, "logps/rejected": -100.97915649414062, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": 4.632787227630615, "rewards/margins": 2.4310576915740967, "rewards/rejected": 2.2017295360565186, "step": 7962 }, { "epoch": 1.76, "learning_rate": 3.653201334054762e-07, "logits/chosen": -1.9206101894378662, "logits/rejected": -1.8524006605148315, "logps/chosen": -57.87261199951172, "logps/rejected": -46.321842193603516, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 3.026036024093628, "rewards/margins": 2.111318588256836, "rewards/rejected": 0.9147174954414368, "step": 7963 }, { "epoch": 1.76, "learning_rate": 3.6464791468023175e-07, "logits/chosen": -2.314345359802246, "logits/rejected": -2.277696132659912, "logps/chosen": -79.09170532226562, "logps/rejected": -48.762779235839844, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 5.437231540679932, "rewards/margins": 3.524771213531494, "rewards/rejected": 1.9124603271484375, "step": 7964 }, { "epoch": 1.76, "learning_rate": 3.6397629158658853e-07, "logits/chosen": -1.928312063217163, "logits/rejected": -1.7964907884597778, "logps/chosen": -42.21329116821289, "logps/rejected": -20.323078155517578, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 4.251779079437256, "rewards/margins": 3.5273780822753906, "rewards/rejected": 0.7244009375572205, "step": 7965 }, { "epoch": 1.76, "learning_rate": 3.63305264210847e-07, "logits/chosen": -2.0383920669555664, "logits/rejected": -1.9588112831115723, "logps/chosen": -75.53630065917969, "logps/rejected": -47.6670036315918, "loss": 0.189, "rewards/accuracies": 1.0, "rewards/chosen": 4.19629430770874, "rewards/margins": 0.8657808303833008, "rewards/rejected": 3.3305134773254395, "step": 7966 }, { "epoch": 1.76, "learning_rate": 3.6263483263923207e-07, "logits/chosen": -1.6710795164108276, "logits/rejected": -1.6395905017852783, "logps/chosen": -33.09629821777344, "logps/rejected": -24.89832305908203, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 4.329638957977295, "rewards/margins": 1.193272590637207, "rewards/rejected": 3.136366367340088, "step": 7967 }, { "epoch": 1.76, "learning_rate": 3.619649969578931e-07, "logits/chosen": -1.6794044971466064, "logits/rejected": -1.6669659614562988, "logps/chosen": -46.365394592285156, "logps/rejected": -40.11302185058594, "loss": 0.1644, "rewards/accuracies": 1.0, "rewards/chosen": 4.507709503173828, "rewards/margins": 0.9684593677520752, "rewards/rejected": 3.539250135421753, "step": 7968 }, { "epoch": 1.76, "learning_rate": 3.61295757252903e-07, "logits/chosen": -2.0395469665527344, "logits/rejected": -1.9732341766357422, "logps/chosen": -36.56897735595703, "logps/rejected": -58.175621032714844, "loss": 0.7108, "rewards/accuracies": 0.0, "rewards/chosen": 5.3020219802856445, "rewards/margins": -1.1364974975585938, "rewards/rejected": 6.438519477844238, "step": 7969 }, { "epoch": 1.76, "learning_rate": 3.6062711361025683e-07, "logits/chosen": -2.192908763885498, "logits/rejected": -2.2053403854370117, "logps/chosen": -74.4427490234375, "logps/rejected": -86.1172103881836, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 10.145491600036621, "rewards/margins": 0.1926126480102539, "rewards/rejected": 9.952878952026367, "step": 7970 }, { "epoch": 1.76, "learning_rate": 3.5995906611587363e-07, "logits/chosen": -2.033924102783203, "logits/rejected": -2.0220885276794434, "logps/chosen": -31.50372314453125, "logps/rejected": -75.92898559570312, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 3.8797593116760254, "rewards/margins": 0.6372025012969971, "rewards/rejected": 3.2425568103790283, "step": 7971 }, { "epoch": 1.76, "learning_rate": 3.5929161485559694e-07, "logits/chosen": -1.8193687200546265, "logits/rejected": -1.6270201206207275, "logps/chosen": -45.672821044921875, "logps/rejected": -25.627185821533203, "loss": 0.0912, "rewards/accuracies": 1.0, "rewards/chosen": 4.352177619934082, "rewards/margins": 1.6290771961212158, "rewards/rejected": 2.723100423812866, "step": 7972 }, { "epoch": 1.76, "learning_rate": 3.5862475991519043e-07, "logits/chosen": -2.017700672149658, "logits/rejected": -2.017700672149658, "logps/chosen": -67.38714599609375, "logps/rejected": -67.38714599609375, "loss": 0.4428, "rewards/accuracies": 0.0, "rewards/chosen": 6.688000679016113, "rewards/margins": 0.0, "rewards/rejected": 6.688000679016113, "step": 7973 }, { "epoch": 1.76, "learning_rate": 3.5795850138034604e-07, "logits/chosen": -1.9632327556610107, "logits/rejected": -1.4177428483963013, "logps/chosen": -59.44932174682617, "logps/rejected": -80.03578186035156, "loss": 0.26, "rewards/accuracies": 1.0, "rewards/chosen": 4.08677339553833, "rewards/margins": 0.4259319305419922, "rewards/rejected": 3.660841464996338, "step": 7974 }, { "epoch": 1.77, "learning_rate": 3.572928393366731e-07, "logits/chosen": -1.9496976137161255, "logits/rejected": -1.9845750331878662, "logps/chosen": -45.04845428466797, "logps/rejected": -153.2576446533203, "loss": 1.091, "rewards/accuracies": 0.0, "rewards/chosen": 4.949052333831787, "rewards/margins": -1.8106918334960938, "rewards/rejected": 6.759744167327881, "step": 7975 }, { "epoch": 1.77, "learning_rate": 3.5662777386971157e-07, "logits/chosen": -1.7325613498687744, "logits/rejected": -1.7751048803329468, "logps/chosen": -28.082691192626953, "logps/rejected": -122.70158386230469, "loss": 0.7844, "rewards/accuracies": 0.0, "rewards/chosen": 4.585845470428467, "rewards/margins": -0.8259634971618652, "rewards/rejected": 5.411808967590332, "step": 7976 }, { "epoch": 1.77, "learning_rate": 3.559633050649181e-07, "logits/chosen": -1.7795805931091309, "logits/rejected": -1.7651885747909546, "logps/chosen": -18.618446350097656, "logps/rejected": -48.00065612792969, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 2.5104851722717285, "rewards/margins": 0.5298279523849487, "rewards/rejected": 1.9806572198867798, "step": 7977 }, { "epoch": 1.77, "learning_rate": 3.552994330076759e-07, "logits/chosen": -2.1088595390319824, "logits/rejected": -2.0739970207214355, "logps/chosen": -40.0316162109375, "logps/rejected": -40.711090087890625, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 3.4635238647460938, "rewards/margins": 0.47928762435913086, "rewards/rejected": 2.984236240386963, "step": 7978 }, { "epoch": 1.77, "learning_rate": 3.5463615778329074e-07, "logits/chosen": -2.116677761077881, "logits/rejected": -2.149928092956543, "logps/chosen": -81.3646240234375, "logps/rejected": -107.31117248535156, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 10.652010917663574, "rewards/margins": 2.18597412109375, "rewards/rejected": 8.466036796569824, "step": 7979 }, { "epoch": 1.77, "learning_rate": 3.539734794769922e-07, "logits/chosen": -1.8938053846359253, "logits/rejected": -1.7470383644104004, "logps/chosen": -55.55548095703125, "logps/rejected": -20.006481170654297, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 3.907078504562378, "rewards/margins": 3.513068914413452, "rewards/rejected": 0.3940095901489258, "step": 7980 }, { "epoch": 1.77, "learning_rate": 3.533113981739339e-07, "logits/chosen": -1.8901846408843994, "logits/rejected": -1.8217822313308716, "logps/chosen": -96.05609130859375, "logps/rejected": -50.848777770996094, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 4.9094557762146, "rewards/margins": 2.57578182220459, "rewards/rejected": 2.3336739540100098, "step": 7981 }, { "epoch": 1.77, "learning_rate": 3.526499139591888e-07, "logits/chosen": -1.770225167274475, "logits/rejected": -1.750778079032898, "logps/chosen": -97.67732238769531, "logps/rejected": -53.51076126098633, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 7.146583557128906, "rewards/margins": 4.8485636711120605, "rewards/rejected": 2.2980198860168457, "step": 7982 }, { "epoch": 1.77, "learning_rate": 3.519890269177595e-07, "logits/chosen": -1.9310592412948608, "logits/rejected": -1.9364087581634521, "logps/chosen": -52.78573989868164, "logps/rejected": -84.53681182861328, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": 4.5324931144714355, "rewards/margins": 1.8640780448913574, "rewards/rejected": 2.668415069580078, "step": 7983 }, { "epoch": 1.77, "learning_rate": 3.513287371345653e-07, "logits/chosen": -2.141610860824585, "logits/rejected": -2.1162896156311035, "logps/chosen": -53.6940803527832, "logps/rejected": -50.07466125488281, "loss": 0.1878, "rewards/accuracies": 1.0, "rewards/chosen": 5.307825088500977, "rewards/margins": 0.7880678176879883, "rewards/rejected": 4.519757270812988, "step": 7984 }, { "epoch": 1.77, "learning_rate": 3.506690446944544e-07, "logits/chosen": -2.1523971557617188, "logits/rejected": -1.9789092540740967, "logps/chosen": -90.09259796142578, "logps/rejected": -24.559478759765625, "loss": 0.3506, "rewards/accuracies": 1.0, "rewards/chosen": 5.7851691246032715, "rewards/margins": 4.808380126953125, "rewards/rejected": 0.976789116859436, "step": 7985 }, { "epoch": 1.77, "learning_rate": 3.5000994968219406e-07, "logits/chosen": -1.5162017345428467, "logits/rejected": -1.4712342023849487, "logps/chosen": -30.487903594970703, "logps/rejected": -35.290340423583984, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 2.5341782569885254, "rewards/margins": 0.28995823860168457, "rewards/rejected": 2.244220018386841, "step": 7986 }, { "epoch": 1.77, "learning_rate": 3.49351452182477e-07, "logits/chosen": -2.05552077293396, "logits/rejected": -1.969045877456665, "logps/chosen": -103.23062896728516, "logps/rejected": -85.39120483398438, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 5.536479949951172, "rewards/margins": 3.019310712814331, "rewards/rejected": 2.517169237136841, "step": 7987 }, { "epoch": 1.77, "learning_rate": 3.486935522799184e-07, "logits/chosen": -1.9489243030548096, "logits/rejected": -1.9220176935195923, "logps/chosen": -130.02493286132812, "logps/rejected": -126.18817138671875, "loss": 0.1237, "rewards/accuracies": 1.0, "rewards/chosen": 9.780146598815918, "rewards/margins": 1.2862167358398438, "rewards/rejected": 8.493929862976074, "step": 7988 }, { "epoch": 1.77, "learning_rate": 3.480362500590573e-07, "logits/chosen": -2.125051498413086, "logits/rejected": -2.0631983280181885, "logps/chosen": -36.081642150878906, "logps/rejected": -38.66980743408203, "loss": 0.7306, "rewards/accuracies": 1.0, "rewards/chosen": 3.0879998207092285, "rewards/margins": 1.0155773162841797, "rewards/rejected": 2.072422504425049, "step": 7989 }, { "epoch": 1.77, "learning_rate": 3.4737954560435496e-07, "logits/chosen": -1.9401030540466309, "logits/rejected": -1.4428579807281494, "logps/chosen": -57.730918884277344, "logps/rejected": -86.36959838867188, "loss": 0.3865, "rewards/accuracies": 1.0, "rewards/chosen": 4.157692909240723, "rewards/margins": 0.026699066162109375, "rewards/rejected": 4.130993843078613, "step": 7990 }, { "epoch": 1.77, "learning_rate": 3.4672343900019556e-07, "logits/chosen": -2.2659826278686523, "logits/rejected": -2.2642316818237305, "logps/chosen": -63.0412483215332, "logps/rejected": -137.05703735351562, "loss": 1.0056, "rewards/accuracies": 0.0, "rewards/chosen": 8.117952346801758, "rewards/margins": -1.8395967483520508, "rewards/rejected": 9.957549095153809, "step": 7991 }, { "epoch": 1.77, "learning_rate": 3.460679303308895e-07, "logits/chosen": -1.9822113513946533, "logits/rejected": -1.9714369773864746, "logps/chosen": -85.82804870605469, "logps/rejected": -65.47737884521484, "loss": 0.28, "rewards/accuracies": 1.0, "rewards/chosen": 8.073783874511719, "rewards/margins": 0.5233421325683594, "rewards/rejected": 7.550441741943359, "step": 7992 }, { "epoch": 1.77, "learning_rate": 3.454130196806649e-07, "logits/chosen": -1.9949671030044556, "logits/rejected": -1.9949671030044556, "logps/chosen": -26.739500045776367, "logps/rejected": -26.739500045776367, "loss": 0.9799, "rewards/accuracies": 0.0, "rewards/chosen": 6.320807933807373, "rewards/margins": 0.0, "rewards/rejected": 6.320807933807373, "step": 7993 }, { "epoch": 1.77, "learning_rate": 3.447587071336794e-07, "logits/chosen": -2.005401849746704, "logits/rejected": -2.0407652854919434, "logps/chosen": -120.75216674804688, "logps/rejected": -61.1065673828125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 8.81602668762207, "rewards/margins": 6.329492568969727, "rewards/rejected": 2.4865341186523438, "step": 7994 }, { "epoch": 1.77, "learning_rate": 3.4410499277400867e-07, "logits/chosen": -2.3314785957336426, "logits/rejected": -2.2601699829101562, "logps/chosen": -78.19656372070312, "logps/rejected": -33.396026611328125, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 6.616392612457275, "rewards/margins": 6.456878662109375, "rewards/rejected": 0.15951386094093323, "step": 7995 }, { "epoch": 1.77, "learning_rate": 3.4345187668565427e-07, "logits/chosen": -1.9244651794433594, "logits/rejected": -1.9244651794433594, "logps/chosen": -40.60485076904297, "logps/rejected": -40.60485076904297, "loss": 2.213, "rewards/accuracies": 0.0, "rewards/chosen": 3.8593223094940186, "rewards/margins": 0.0, "rewards/rejected": 3.8593223094940186, "step": 7996 }, { "epoch": 1.77, "learning_rate": 3.4279935895253966e-07, "logits/chosen": -1.459779977798462, "logits/rejected": -1.459779977798462, "logps/chosen": -29.93838119506836, "logps/rejected": -29.93838119506836, "loss": 0.3573, "rewards/accuracies": 0.0, "rewards/chosen": 2.1247432231903076, "rewards/margins": 0.0, "rewards/rejected": 2.1247432231903076, "step": 7997 }, { "epoch": 1.77, "learning_rate": 3.4214743965851063e-07, "logits/chosen": -2.1956140995025635, "logits/rejected": -2.1593177318573, "logps/chosen": -57.18931198120117, "logps/rejected": -80.8665542602539, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": 4.260287761688232, "rewards/margins": 2.791133403778076, "rewards/rejected": 1.4691543579101562, "step": 7998 }, { "epoch": 1.77, "learning_rate": 3.4149611888734e-07, "logits/chosen": -2.309468984603882, "logits/rejected": -2.306067943572998, "logps/chosen": -83.2894287109375, "logps/rejected": -84.35115814208984, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 9.082880973815918, "rewards/margins": 4.012918472290039, "rewards/rejected": 5.069962501525879, "step": 7999 }, { "epoch": 1.77, "learning_rate": 3.4084539672271764e-07, "logits/chosen": -1.8629533052444458, "logits/rejected": -1.8538116216659546, "logps/chosen": -28.838796615600586, "logps/rejected": -23.431541442871094, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 2.2550947666168213, "rewards/margins": 0.4216676950454712, "rewards/rejected": 1.83342707157135, "step": 8000 }, { "epoch": 1.77, "learning_rate": 3.401952732482633e-07, "logits/chosen": -1.8923221826553345, "logits/rejected": -1.8120942115783691, "logps/chosen": -44.064144134521484, "logps/rejected": -78.86772155761719, "loss": 0.165, "rewards/accuracies": 1.0, "rewards/chosen": 4.852983474731445, "rewards/margins": 2.174194574356079, "rewards/rejected": 2.678788900375366, "step": 8001 }, { "epoch": 1.77, "learning_rate": 3.3954574854751286e-07, "logits/chosen": -1.9368048906326294, "logits/rejected": -1.848719835281372, "logps/chosen": -82.57282257080078, "logps/rejected": -106.991943359375, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 8.24142074584961, "rewards/margins": 2.5426597595214844, "rewards/rejected": 5.698760986328125, "step": 8002 }, { "epoch": 1.77, "learning_rate": 3.388968227039319e-07, "logits/chosen": -1.8976861238479614, "logits/rejected": -1.8550992012023926, "logps/chosen": -23.957698822021484, "logps/rejected": -57.231285095214844, "loss": 0.492, "rewards/accuracies": 0.0, "rewards/chosen": 1.4027481079101562, "rewards/margins": -0.46325767040252686, "rewards/rejected": 1.866005778312683, "step": 8003 }, { "epoch": 1.77, "learning_rate": 3.382484958009036e-07, "logits/chosen": -1.6919723749160767, "logits/rejected": -1.667267918586731, "logps/chosen": -34.59745788574219, "logps/rejected": -37.42197799682617, "loss": 0.4869, "rewards/accuracies": 1.0, "rewards/chosen": 3.010382890701294, "rewards/margins": 1.522043228149414, "rewards/rejected": 1.4883396625518799, "step": 8004 }, { "epoch": 1.77, "learning_rate": 3.376007679217369e-07, "logits/chosen": -1.8786267042160034, "logits/rejected": -1.9345914125442505, "logps/chosen": -78.92320251464844, "logps/rejected": -114.09416198730469, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 11.052484512329102, "rewards/margins": 3.1570940017700195, "rewards/rejected": 7.895390510559082, "step": 8005 }, { "epoch": 1.77, "learning_rate": 3.369536391496631e-07, "logits/chosen": -2.0220398902893066, "logits/rejected": -1.9987127780914307, "logps/chosen": -35.532142639160156, "logps/rejected": -46.96450424194336, "loss": 0.456, "rewards/accuracies": 0.0, "rewards/chosen": 3.1535117626190186, "rewards/margins": -0.2769155502319336, "rewards/rejected": 3.430427312850952, "step": 8006 }, { "epoch": 1.77, "learning_rate": 3.363071095678377e-07, "logits/chosen": -1.6982340812683105, "logits/rejected": -1.6982340812683105, "logps/chosen": -30.287433624267578, "logps/rejected": -30.287433624267578, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 1.989834189414978, "rewards/margins": 0.0, "rewards/rejected": 1.989834189414978, "step": 8007 }, { "epoch": 1.77, "learning_rate": 3.3566117925933784e-07, "logits/chosen": -2.0209054946899414, "logits/rejected": -1.9912786483764648, "logps/chosen": -52.054603576660156, "logps/rejected": -70.89299011230469, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 3.765523672103882, "rewards/margins": 1.637326955795288, "rewards/rejected": 2.1281967163085938, "step": 8008 }, { "epoch": 1.77, "learning_rate": 3.3501584830716196e-07, "logits/chosen": -1.766097903251648, "logits/rejected": -1.6436716318130493, "logps/chosen": -147.46994018554688, "logps/rejected": -106.14630126953125, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 8.549674987792969, "rewards/margins": 5.004101753234863, "rewards/rejected": 3.5455734729766846, "step": 8009 }, { "epoch": 1.77, "learning_rate": 3.343711167942371e-07, "logits/chosen": -1.7910469770431519, "logits/rejected": -1.7958176136016846, "logps/chosen": -42.3594856262207, "logps/rejected": -39.39910125732422, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": 3.697023391723633, "rewards/margins": 1.3129000663757324, "rewards/rejected": 2.3841233253479004, "step": 8010 }, { "epoch": 1.77, "learning_rate": 3.337269848034064e-07, "logits/chosen": -1.7110382318496704, "logits/rejected": -1.3870121240615845, "logps/chosen": -25.45142364501953, "logps/rejected": -42.95331573486328, "loss": 0.0832, "rewards/accuracies": 1.0, "rewards/chosen": 3.6086983680725098, "rewards/margins": 1.9480644464492798, "rewards/rejected": 1.66063392162323, "step": 8011 }, { "epoch": 1.77, "learning_rate": 3.33083452417442e-07, "logits/chosen": -1.719274878501892, "logits/rejected": -1.6932823657989502, "logps/chosen": -25.63817024230957, "logps/rejected": -48.13706970214844, "loss": 0.2782, "rewards/accuracies": 1.0, "rewards/chosen": 2.4804015159606934, "rewards/margins": 0.6728148460388184, "rewards/rejected": 1.807586669921875, "step": 8012 }, { "epoch": 1.77, "learning_rate": 3.3244051971903447e-07, "logits/chosen": -2.000887393951416, "logits/rejected": -2.0321247577667236, "logps/chosen": -34.780029296875, "logps/rejected": -64.49163818359375, "loss": 0.5643, "rewards/accuracies": 0.0, "rewards/chosen": 3.2556464672088623, "rewards/margins": -0.19011402130126953, "rewards/rejected": 3.445760488510132, "step": 8013 }, { "epoch": 1.77, "learning_rate": 3.3179818679079936e-07, "logits/chosen": -1.7431262731552124, "logits/rejected": -1.6782984733581543, "logps/chosen": -26.6925048828125, "logps/rejected": -39.145164489746094, "loss": 0.1854, "rewards/accuracies": 1.0, "rewards/chosen": 2.024047613143921, "rewards/margins": 0.8131833076477051, "rewards/rejected": 1.2108643054962158, "step": 8014 }, { "epoch": 1.77, "learning_rate": 3.311564537152756e-07, "logits/chosen": -2.0397636890411377, "logits/rejected": -1.9690369367599487, "logps/chosen": -141.9890899658203, "logps/rejected": -35.03523254394531, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": 6.835493564605713, "rewards/margins": 3.846459150314331, "rewards/rejected": 2.989034414291382, "step": 8015 }, { "epoch": 1.77, "learning_rate": 3.305153205749234e-07, "logits/chosen": -1.9416532516479492, "logits/rejected": -1.9572019577026367, "logps/chosen": -63.63092041015625, "logps/rejected": -136.47537231445312, "loss": 0.3128, "rewards/accuracies": 1.0, "rewards/chosen": 7.216042995452881, "rewards/margins": 0.2519865036010742, "rewards/rejected": 6.964056491851807, "step": 8016 }, { "epoch": 1.77, "learning_rate": 3.2987478745212853e-07, "logits/chosen": -2.2072291374206543, "logits/rejected": -2.2339046001434326, "logps/chosen": -161.4096221923828, "logps/rejected": -119.08367919921875, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 11.985758781433105, "rewards/margins": 3.5323448181152344, "rewards/rejected": 8.453413963317871, "step": 8017 }, { "epoch": 1.77, "learning_rate": 3.292348544291957e-07, "logits/chosen": -1.8534005880355835, "logits/rejected": -1.8594657182693481, "logps/chosen": -57.76942443847656, "logps/rejected": -80.55420684814453, "loss": 0.5983, "rewards/accuracies": 0.0, "rewards/chosen": 6.412837505340576, "rewards/margins": -0.7948689460754395, "rewards/rejected": 7.207706451416016, "step": 8018 }, { "epoch": 1.77, "learning_rate": 3.2859552158835684e-07, "logits/chosen": -1.7875452041625977, "logits/rejected": -1.7635689973831177, "logps/chosen": -69.35628509521484, "logps/rejected": -74.1885986328125, "loss": 0.1564, "rewards/accuracies": 1.0, "rewards/chosen": 3.0073211193084717, "rewards/margins": 2.558972120285034, "rewards/rejected": 0.4483489990234375, "step": 8019 }, { "epoch": 1.78, "learning_rate": 3.27956789011763e-07, "logits/chosen": -2.021803617477417, "logits/rejected": -1.916959285736084, "logps/chosen": -83.42736053466797, "logps/rejected": -23.762348175048828, "loss": 0.1131, "rewards/accuracies": 1.0, "rewards/chosen": 2.631671190261841, "rewards/margins": 1.7610149383544922, "rewards/rejected": 0.8706561923027039, "step": 8020 }, { "epoch": 1.78, "learning_rate": 3.2731865678149064e-07, "logits/chosen": -2.295755624771118, "logits/rejected": -1.851245641708374, "logps/chosen": -76.38545227050781, "logps/rejected": -52.34865951538086, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 5.393582344055176, "rewards/margins": 2.9716367721557617, "rewards/rejected": 2.421945571899414, "step": 8021 }, { "epoch": 1.78, "learning_rate": 3.2668112497953766e-07, "logits/chosen": -1.814794898033142, "logits/rejected": -1.7329795360565186, "logps/chosen": -95.01742553710938, "logps/rejected": -92.02220153808594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 10.002275466918945, "rewards/margins": 7.824704170227051, "rewards/rejected": 2.1775710582733154, "step": 8022 }, { "epoch": 1.78, "learning_rate": 3.2604419368782616e-07, "logits/chosen": -1.8371427059173584, "logits/rejected": -1.7956860065460205, "logps/chosen": -46.95178985595703, "logps/rejected": -55.09297561645508, "loss": 0.322, "rewards/accuracies": 1.0, "rewards/chosen": 4.379441261291504, "rewards/margins": 0.2652754783630371, "rewards/rejected": 4.114165782928467, "step": 8023 }, { "epoch": 1.78, "learning_rate": 3.254078629881996e-07, "logits/chosen": -1.8831863403320312, "logits/rejected": -1.8311847448349, "logps/chosen": -57.2440071105957, "logps/rejected": -41.010284423828125, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": 3.645780563354492, "rewards/margins": 1.8732738494873047, "rewards/rejected": 1.7725067138671875, "step": 8024 }, { "epoch": 1.78, "learning_rate": 3.247721329624254e-07, "logits/chosen": -2.0176448822021484, "logits/rejected": -1.987200379371643, "logps/chosen": -111.03412628173828, "logps/rejected": -64.45765686035156, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 6.288669586181641, "rewards/margins": 3.5474448204040527, "rewards/rejected": 2.741224765777588, "step": 8025 }, { "epoch": 1.78, "learning_rate": 3.241370036921937e-07, "logits/chosen": -1.9080039262771606, "logits/rejected": -1.9080039262771606, "logps/chosen": -30.56989288330078, "logps/rejected": -30.56989288330078, "loss": 0.6586, "rewards/accuracies": 0.0, "rewards/chosen": 7.5868425369262695, "rewards/margins": 0.0, "rewards/rejected": 7.5868425369262695, "step": 8026 }, { "epoch": 1.78, "learning_rate": 3.2350247525911594e-07, "logits/chosen": -1.894740104675293, "logits/rejected": -1.8649652004241943, "logps/chosen": -95.58720397949219, "logps/rejected": -110.6332015991211, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 7.642985820770264, "rewards/margins": 4.14765739440918, "rewards/rejected": 3.495328664779663, "step": 8027 }, { "epoch": 1.78, "learning_rate": 3.228685477447291e-07, "logits/chosen": -2.0778980255126953, "logits/rejected": -2.018479347229004, "logps/chosen": -90.14484405517578, "logps/rejected": -59.0782470703125, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 7.374776363372803, "rewards/margins": 3.790750741958618, "rewards/rejected": 3.5840256214141846, "step": 8028 }, { "epoch": 1.78, "learning_rate": 3.2223522123048913e-07, "logits/chosen": -2.274259567260742, "logits/rejected": -2.298633098602295, "logps/chosen": -45.212364196777344, "logps/rejected": -136.57908630371094, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 7.351058483123779, "rewards/margins": 2.654876232147217, "rewards/rejected": 4.6961822509765625, "step": 8029 }, { "epoch": 1.78, "learning_rate": 3.216024957977787e-07, "logits/chosen": -1.9214576482772827, "logits/rejected": -1.9077754020690918, "logps/chosen": -57.012290954589844, "logps/rejected": -86.14508819580078, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 6.327291011810303, "rewards/margins": 1.3610715866088867, "rewards/rejected": 4.966219425201416, "step": 8030 }, { "epoch": 1.78, "learning_rate": 3.209703715279011e-07, "logits/chosen": -1.934675693511963, "logits/rejected": -1.9237818717956543, "logps/chosen": -69.32801818847656, "logps/rejected": -46.495018005371094, "loss": 0.6473, "rewards/accuracies": 0.0, "rewards/chosen": 1.9332268238067627, "rewards/margins": -0.891425371170044, "rewards/rejected": 2.8246521949768066, "step": 8031 }, { "epoch": 1.78, "learning_rate": 3.203388485020825e-07, "logits/chosen": -1.7370880842208862, "logits/rejected": -1.7005088329315186, "logps/chosen": -38.76171875, "logps/rejected": -66.69058227539062, "loss": 0.1647, "rewards/accuracies": 1.0, "rewards/chosen": 3.915930986404419, "rewards/margins": 0.9470100402832031, "rewards/rejected": 2.968920946121216, "step": 8032 }, { "epoch": 1.78, "learning_rate": 3.197079268014719e-07, "logits/chosen": -1.8389146327972412, "logits/rejected": -1.835699200630188, "logps/chosen": -61.201515197753906, "logps/rejected": -37.893829345703125, "loss": 0.5525, "rewards/accuracies": 1.0, "rewards/chosen": 3.200108289718628, "rewards/margins": 0.42723751068115234, "rewards/rejected": 2.7728707790374756, "step": 8033 }, { "epoch": 1.78, "learning_rate": 3.1907760650714215e-07, "logits/chosen": -1.688024878501892, "logits/rejected": -1.6046438217163086, "logps/chosen": -29.37941551208496, "logps/rejected": -10.8626708984375, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 3.775136709213257, "rewards/margins": 2.8639585971832275, "rewards/rejected": 0.9111780524253845, "step": 8034 }, { "epoch": 1.78, "learning_rate": 3.184478877000868e-07, "logits/chosen": -1.9866571426391602, "logits/rejected": -1.894184947013855, "logps/chosen": -55.391754150390625, "logps/rejected": -9.610565185546875, "loss": 0.5802, "rewards/accuracies": 0.0, "rewards/chosen": 2.225658416748047, "rewards/margins": -0.7764461040496826, "rewards/rejected": 3.0021045207977295, "step": 8035 }, { "epoch": 1.78, "learning_rate": 3.1781877046122454e-07, "logits/chosen": -2.1302683353424072, "logits/rejected": -2.1044883728027344, "logps/chosen": -65.96837615966797, "logps/rejected": -31.419565200805664, "loss": 0.3684, "rewards/accuracies": 1.0, "rewards/chosen": 4.691695690155029, "rewards/margins": 1.4143037796020508, "rewards/rejected": 3.2773919105529785, "step": 8036 }, { "epoch": 1.78, "learning_rate": 3.17190254871394e-07, "logits/chosen": -1.7406233549118042, "logits/rejected": -1.719567894935608, "logps/chosen": -59.224853515625, "logps/rejected": -67.91252899169922, "loss": 0.4064, "rewards/accuracies": 1.0, "rewards/chosen": 3.42022705078125, "rewards/margins": 1.1872389316558838, "rewards/rejected": 2.232988119125366, "step": 8037 }, { "epoch": 1.78, "learning_rate": 3.165623410113594e-07, "logits/chosen": -1.847988486289978, "logits/rejected": -1.8236138820648193, "logps/chosen": -48.312015533447266, "logps/rejected": -52.164154052734375, "loss": 0.1071, "rewards/accuracies": 1.0, "rewards/chosen": 3.834855318069458, "rewards/margins": 1.46095609664917, "rewards/rejected": 2.373899221420288, "step": 8038 }, { "epoch": 1.78, "learning_rate": 3.159350289618046e-07, "logits/chosen": -2.1365909576416016, "logits/rejected": -2.1199309825897217, "logps/chosen": -29.91520118713379, "logps/rejected": -50.59644317626953, "loss": 1.1205, "rewards/accuracies": 1.0, "rewards/chosen": 3.8926796913146973, "rewards/margins": 0.20276951789855957, "rewards/rejected": 3.6899101734161377, "step": 8039 }, { "epoch": 1.78, "learning_rate": 3.1530831880333846e-07, "logits/chosen": -2.096567392349243, "logits/rejected": -1.5132074356079102, "logps/chosen": -84.23374938964844, "logps/rejected": -60.380157470703125, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 6.627954006195068, "rewards/margins": 3.566469430923462, "rewards/rejected": 3.0614845752716064, "step": 8040 }, { "epoch": 1.78, "learning_rate": 3.146822106164915e-07, "logits/chosen": -2.1283419132232666, "logits/rejected": -1.8795539140701294, "logps/chosen": -31.941402435302734, "logps/rejected": -25.898717880249023, "loss": 0.6519, "rewards/accuracies": 1.0, "rewards/chosen": 3.87555193901062, "rewards/margins": 0.08223605155944824, "rewards/rejected": 3.793315887451172, "step": 8041 }, { "epoch": 1.78, "learning_rate": 3.140567044817172e-07, "logits/chosen": -1.8311365842819214, "logits/rejected": -1.807891607284546, "logps/chosen": -31.761096954345703, "logps/rejected": -57.11865997314453, "loss": 0.3645, "rewards/accuracies": 0.0, "rewards/chosen": 4.573916435241699, "rewards/margins": -0.042490482330322266, "rewards/rejected": 4.6164069175720215, "step": 8042 }, { "epoch": 1.78, "learning_rate": 3.1343180047939236e-07, "logits/chosen": -1.873287320137024, "logits/rejected": -1.870911717414856, "logps/chosen": -50.208778381347656, "logps/rejected": -49.51606369018555, "loss": 0.3269, "rewards/accuracies": 1.0, "rewards/chosen": 3.5880608558654785, "rewards/margins": 0.41934704780578613, "rewards/rejected": 3.1687138080596924, "step": 8043 }, { "epoch": 1.78, "learning_rate": 3.128074986898144e-07, "logits/chosen": -1.8030089139938354, "logits/rejected": -1.7670152187347412, "logps/chosen": -72.52656555175781, "logps/rejected": -81.19476318359375, "loss": 0.07, "rewards/accuracies": 1.0, "rewards/chosen": 4.422557353973389, "rewards/margins": 1.9614808559417725, "rewards/rejected": 2.461076498031616, "step": 8044 }, { "epoch": 1.78, "learning_rate": 3.121837991932053e-07, "logits/chosen": -1.6631107330322266, "logits/rejected": -1.6631107330322266, "logps/chosen": -47.14241409301758, "logps/rejected": -47.14241409301758, "loss": 0.8044, "rewards/accuracies": 0.0, "rewards/chosen": 2.2380168437957764, "rewards/margins": 0.0, "rewards/rejected": 2.2380168437957764, "step": 8045 }, { "epoch": 1.78, "learning_rate": 3.1156070206970866e-07, "logits/chosen": -2.0283117294311523, "logits/rejected": -2.0141921043395996, "logps/chosen": -48.497806549072266, "logps/rejected": -68.5504150390625, "loss": 0.0863, "rewards/accuracies": 1.0, "rewards/chosen": 3.0637173652648926, "rewards/margins": 2.2574613094329834, "rewards/rejected": 0.806256115436554, "step": 8046 }, { "epoch": 1.78, "learning_rate": 3.109382073993916e-07, "logits/chosen": -1.900152564048767, "logits/rejected": -1.8335682153701782, "logps/chosen": -70.45098876953125, "logps/rejected": -65.22982025146484, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 4.959965705871582, "rewards/margins": 3.8969597816467285, "rewards/rejected": 1.063005805015564, "step": 8047 }, { "epoch": 1.78, "learning_rate": 3.1031631526224237e-07, "logits/chosen": -1.9756956100463867, "logits/rejected": -2.0093886852264404, "logps/chosen": -109.48631286621094, "logps/rejected": -102.3502197265625, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 12.6449556350708, "rewards/margins": 3.317312240600586, "rewards/rejected": 9.327643394470215, "step": 8048 }, { "epoch": 1.78, "learning_rate": 3.096950257381731e-07, "logits/chosen": -2.0263140201568604, "logits/rejected": -2.03458833694458, "logps/chosen": -46.976280212402344, "logps/rejected": -82.4149169921875, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": 4.833321571350098, "rewards/margins": 2.1325769424438477, "rewards/rejected": 2.70074462890625, "step": 8049 }, { "epoch": 1.78, "learning_rate": 3.090743389070172e-07, "logits/chosen": -2.020681381225586, "logits/rejected": -2.0044400691986084, "logps/chosen": -54.89990234375, "logps/rejected": -40.281524658203125, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 3.259286642074585, "rewards/margins": 0.20166492462158203, "rewards/rejected": 3.057621717453003, "step": 8050 }, { "epoch": 1.78, "learning_rate": 3.084542548485325e-07, "logits/chosen": -1.8413587808609009, "logits/rejected": -1.8208463191986084, "logps/chosen": -50.1068000793457, "logps/rejected": -62.33462142944336, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 5.609999656677246, "rewards/margins": 0.13503360748291016, "rewards/rejected": 5.474966049194336, "step": 8051 }, { "epoch": 1.78, "learning_rate": 3.078347736423981e-07, "logits/chosen": -2.018676519393921, "logits/rejected": -1.9753504991531372, "logps/chosen": -62.42196273803711, "logps/rejected": -32.87118148803711, "loss": 0.1917, "rewards/accuracies": 1.0, "rewards/chosen": 5.709274768829346, "rewards/margins": 1.6835684776306152, "rewards/rejected": 4.0257062911987305, "step": 8052 }, { "epoch": 1.78, "learning_rate": 3.072158953682153e-07, "logits/chosen": -1.7334052324295044, "logits/rejected": -1.754939317703247, "logps/chosen": -39.52630615234375, "logps/rejected": -31.77853775024414, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 1.7472587823867798, "rewards/margins": 0.3637508153915405, "rewards/rejected": 1.3835079669952393, "step": 8053 }, { "epoch": 1.78, "learning_rate": 3.0659762010550875e-07, "logits/chosen": -2.014108419418335, "logits/rejected": -1.9748194217681885, "logps/chosen": -72.33570861816406, "logps/rejected": -49.059364318847656, "loss": 0.1633, "rewards/accuracies": 1.0, "rewards/chosen": 3.957443952560425, "rewards/margins": 1.0873382091522217, "rewards/rejected": 2.870105743408203, "step": 8054 }, { "epoch": 1.78, "learning_rate": 3.0597994793372556e-07, "logits/chosen": -1.9006991386413574, "logits/rejected": -1.8729974031448364, "logps/chosen": -106.4682846069336, "logps/rejected": -124.56501770019531, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 10.550444602966309, "rewards/margins": 2.9101157188415527, "rewards/rejected": 7.640328884124756, "step": 8055 }, { "epoch": 1.78, "learning_rate": 3.0536287893223603e-07, "logits/chosen": -1.9259159564971924, "logits/rejected": -1.8975677490234375, "logps/chosen": -56.612823486328125, "logps/rejected": -64.55461120605469, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": 4.467426300048828, "rewards/margins": 2.1178672313690186, "rewards/rejected": 2.3495590686798096, "step": 8056 }, { "epoch": 1.78, "learning_rate": 3.0474641318032906e-07, "logits/chosen": -2.111980676651001, "logits/rejected": -2.073395013809204, "logps/chosen": -98.13494873046875, "logps/rejected": -49.91212463378906, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": 6.508697509765625, "rewards/margins": 2.1137657165527344, "rewards/rejected": 4.394931793212891, "step": 8057 }, { "epoch": 1.78, "learning_rate": 3.041305507572223e-07, "logits/chosen": -1.7975555658340454, "logits/rejected": -1.841257095336914, "logps/chosen": -36.80644226074219, "logps/rejected": -68.1925277709961, "loss": 0.8437, "rewards/accuracies": 0.0, "rewards/chosen": 2.8220527172088623, "rewards/margins": -0.8263108730316162, "rewards/rejected": 3.6483635902404785, "step": 8058 }, { "epoch": 1.78, "learning_rate": 3.0351529174205086e-07, "logits/chosen": -1.8644554615020752, "logits/rejected": -1.8849397897720337, "logps/chosen": -88.21980285644531, "logps/rejected": -111.60842895507812, "loss": 0.0636, "rewards/accuracies": 1.0, "rewards/chosen": 7.7474045753479, "rewards/margins": 2.0341672897338867, "rewards/rejected": 5.713237285614014, "step": 8059 }, { "epoch": 1.78, "learning_rate": 3.0290063621387367e-07, "logits/chosen": -1.7905919551849365, "logits/rejected": -1.807200312614441, "logps/chosen": -23.906932830810547, "logps/rejected": -90.16069030761719, "loss": 0.4668, "rewards/accuracies": 0.0, "rewards/chosen": 3.8864293098449707, "rewards/margins": -0.23764324188232422, "rewards/rejected": 4.124072551727295, "step": 8060 }, { "epoch": 1.78, "learning_rate": 3.022865842516737e-07, "logits/chosen": -1.763110637664795, "logits/rejected": -1.7039954662322998, "logps/chosen": -26.785755157470703, "logps/rejected": -18.153560638427734, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 4.237121105194092, "rewards/margins": 3.1435317993164062, "rewards/rejected": 1.093589186668396, "step": 8061 }, { "epoch": 1.78, "learning_rate": 3.0167313593435497e-07, "logits/chosen": -2.0463337898254395, "logits/rejected": -1.9996095895767212, "logps/chosen": -51.19203186035156, "logps/rejected": -17.407182693481445, "loss": 0.1553, "rewards/accuracies": 1.0, "rewards/chosen": 3.0967299938201904, "rewards/margins": 1.3019016981124878, "rewards/rejected": 1.7948282957077026, "step": 8062 }, { "epoch": 1.78, "learning_rate": 3.0106029134074343e-07, "logits/chosen": -2.1361083984375, "logits/rejected": -2.1718671321868896, "logps/chosen": -45.82203674316406, "logps/rejected": -122.90826416015625, "loss": 2.4602, "rewards/accuracies": 0.0, "rewards/chosen": 3.783212423324585, "rewards/margins": -4.598711967468262, "rewards/rejected": 8.381924629211426, "step": 8063 }, { "epoch": 1.78, "learning_rate": 3.004480505495888e-07, "logits/chosen": -1.7006869316101074, "logits/rejected": -1.535306692123413, "logps/chosen": -115.78944396972656, "logps/rejected": -96.38761901855469, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 5.497291564941406, "rewards/margins": 3.268092393875122, "rewards/rejected": 2.229199171066284, "step": 8064 }, { "epoch": 1.79, "learning_rate": 2.9983641363956263e-07, "logits/chosen": -1.8020333051681519, "logits/rejected": -1.7883739471435547, "logps/chosen": -26.549083709716797, "logps/rejected": -27.72437286376953, "loss": 0.2574, "rewards/accuracies": 1.0, "rewards/chosen": 4.244742393493652, "rewards/margins": 0.9070553779602051, "rewards/rejected": 3.3376870155334473, "step": 8065 }, { "epoch": 1.79, "learning_rate": 2.99225380689257e-07, "logits/chosen": -1.8545396327972412, "logits/rejected": -1.8157039880752563, "logps/chosen": -49.92289352416992, "logps/rejected": -35.57317352294922, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 7.629251003265381, "rewards/margins": 4.217800140380859, "rewards/rejected": 3.4114511013031006, "step": 8066 }, { "epoch": 1.79, "learning_rate": 2.9861495177719133e-07, "logits/chosen": -2.0351881980895996, "logits/rejected": -1.9133095741271973, "logps/chosen": -112.33158111572266, "logps/rejected": -48.18701171875, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 6.345949649810791, "rewards/margins": 4.552982330322266, "rewards/rejected": 1.7929672002792358, "step": 8067 }, { "epoch": 1.79, "learning_rate": 2.980051269818013e-07, "logits/chosen": -1.878048062324524, "logits/rejected": -1.6299415826797485, "logps/chosen": -63.57442092895508, "logps/rejected": -48.4404296875, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 3.742051362991333, "rewards/margins": 2.0232348442077637, "rewards/rejected": 1.7188163995742798, "step": 8068 }, { "epoch": 1.79, "learning_rate": 2.9739590638144967e-07, "logits/chosen": -2.052920341491699, "logits/rejected": -2.0255448818206787, "logps/chosen": -52.76123809814453, "logps/rejected": -32.64613342285156, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 5.146984100341797, "rewards/margins": 2.9097397327423096, "rewards/rejected": 2.2372443675994873, "step": 8069 }, { "epoch": 1.79, "learning_rate": 2.967872900544194e-07, "logits/chosen": -1.9867979288101196, "logits/rejected": -1.9701310396194458, "logps/chosen": -59.91832733154297, "logps/rejected": -78.33915710449219, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 5.934131622314453, "rewards/margins": 0.5719046592712402, "rewards/rejected": 5.362226963043213, "step": 8070 }, { "epoch": 1.79, "learning_rate": 2.9617927807891575e-07, "logits/chosen": -2.141087532043457, "logits/rejected": -2.1963839530944824, "logps/chosen": -45.37105941772461, "logps/rejected": -96.00005340576172, "loss": 1.127, "rewards/accuracies": 0.0, "rewards/chosen": 5.68742036819458, "rewards/margins": -2.0990214347839355, "rewards/rejected": 7.786441802978516, "step": 8071 }, { "epoch": 1.79, "learning_rate": 2.955718705330679e-07, "logits/chosen": -2.098526954650879, "logits/rejected": -1.9507049322128296, "logps/chosen": -115.71669006347656, "logps/rejected": -73.037353515625, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 6.4591569900512695, "rewards/margins": 3.5086305141448975, "rewards/rejected": 2.950526475906372, "step": 8072 }, { "epoch": 1.79, "learning_rate": 2.949650674949245e-07, "logits/chosen": -1.9640989303588867, "logits/rejected": -1.896280288696289, "logps/chosen": -88.608154296875, "logps/rejected": -199.2384033203125, "loss": 0.1389, "rewards/accuracies": 1.0, "rewards/chosen": 7.842801094055176, "rewards/margins": 1.3712434768676758, "rewards/rejected": 6.4715576171875, "step": 8073 }, { "epoch": 1.79, "learning_rate": 2.9435886904246036e-07, "logits/chosen": -1.8004151582717896, "logits/rejected": -1.8004151582717896, "logps/chosen": -40.723506927490234, "logps/rejected": -40.723506927490234, "loss": 0.3506, "rewards/accuracies": 0.0, "rewards/chosen": 3.5518786907196045, "rewards/margins": 0.0, "rewards/rejected": 3.5518786907196045, "step": 8074 }, { "epoch": 1.79, "learning_rate": 2.937532752535688e-07, "logits/chosen": -2.0807855129241943, "logits/rejected": -2.082108974456787, "logps/chosen": -38.45831298828125, "logps/rejected": -50.8145866394043, "loss": 0.2968, "rewards/accuracies": 1.0, "rewards/chosen": 3.4320342540740967, "rewards/margins": 0.2130603790283203, "rewards/rejected": 3.2189738750457764, "step": 8075 }, { "epoch": 1.79, "learning_rate": 2.931482862060686e-07, "logits/chosen": -1.4813910722732544, "logits/rejected": -1.3108805418014526, "logps/chosen": -42.47113800048828, "logps/rejected": -3.3759734630584717, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 3.2438302040100098, "rewards/margins": 2.3890185356140137, "rewards/rejected": 0.8548116683959961, "step": 8076 }, { "epoch": 1.79, "learning_rate": 2.925439019776971e-07, "logits/chosen": -1.8706843852996826, "logits/rejected": -1.8706843852996826, "logps/chosen": -47.95390319824219, "logps/rejected": -47.95390319824219, "loss": 0.35, "rewards/accuracies": 0.0, "rewards/chosen": 4.980332851409912, "rewards/margins": 0.0, "rewards/rejected": 4.980332851409912, "step": 8077 }, { "epoch": 1.79, "learning_rate": 2.9194012264611993e-07, "logits/chosen": -1.9499622583389282, "logits/rejected": -1.9499622583389282, "logps/chosen": -35.3416633605957, "logps/rejected": -35.3416633605957, "loss": 0.3587, "rewards/accuracies": 0.0, "rewards/chosen": 3.439507007598877, "rewards/margins": 0.0, "rewards/rejected": 3.439507007598877, "step": 8078 }, { "epoch": 1.79, "learning_rate": 2.913369482889178e-07, "logits/chosen": -1.8065299987792969, "logits/rejected": -1.7374223470687866, "logps/chosen": -45.43272399902344, "logps/rejected": -47.16716766357422, "loss": 0.3788, "rewards/accuracies": 1.0, "rewards/chosen": 4.436283111572266, "rewards/margins": 0.7682356834411621, "rewards/rejected": 3.6680474281311035, "step": 8079 }, { "epoch": 1.79, "learning_rate": 2.907343789835987e-07, "logits/chosen": -1.9926011562347412, "logits/rejected": -1.9970735311508179, "logps/chosen": -72.58543395996094, "logps/rejected": -133.3050994873047, "loss": 0.0674, "rewards/accuracies": 1.0, "rewards/chosen": 8.004225730895996, "rewards/margins": 2.2312183380126953, "rewards/rejected": 5.773007392883301, "step": 8080 }, { "epoch": 1.79, "learning_rate": 2.901324148075918e-07, "logits/chosen": -1.8952924013137817, "logits/rejected": -1.9283368587493896, "logps/chosen": -36.80432891845703, "logps/rejected": -87.20884704589844, "loss": 0.2451, "rewards/accuracies": 1.0, "rewards/chosen": 4.320343971252441, "rewards/margins": 0.5125284194946289, "rewards/rejected": 3.8078155517578125, "step": 8081 }, { "epoch": 1.79, "learning_rate": 2.8953105583824634e-07, "logits/chosen": -2.272362232208252, "logits/rejected": -2.3165371417999268, "logps/chosen": -78.88506317138672, "logps/rejected": -146.6645050048828, "loss": 0.2381, "rewards/accuracies": 1.0, "rewards/chosen": 8.990833282470703, "rewards/margins": 0.6041345596313477, "rewards/rejected": 8.386698722839355, "step": 8082 }, { "epoch": 1.79, "learning_rate": 2.889303021528372e-07, "logits/chosen": -1.8471062183380127, "logits/rejected": -1.7086620330810547, "logps/chosen": -135.52493286132812, "logps/rejected": -13.382548332214355, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 6.105947971343994, "rewards/margins": 4.765255451202393, "rewards/rejected": 1.3406926393508911, "step": 8083 }, { "epoch": 1.79, "learning_rate": 2.883301538285582e-07, "logits/chosen": -2.0281991958618164, "logits/rejected": -2.0306460857391357, "logps/chosen": -31.725900650024414, "logps/rejected": -68.04167175292969, "loss": 1.1715, "rewards/accuracies": 0.0, "rewards/chosen": 2.849311590194702, "rewards/margins": -2.241032838821411, "rewards/rejected": 5.090344429016113, "step": 8084 }, { "epoch": 1.79, "learning_rate": 2.8773061094252986e-07, "logits/chosen": -1.8857173919677734, "logits/rejected": -1.8386077880859375, "logps/chosen": -94.9467544555664, "logps/rejected": -113.95491027832031, "loss": 0.1378, "rewards/accuracies": 1.0, "rewards/chosen": 8.730419158935547, "rewards/margins": 2.1220755577087402, "rewards/rejected": 6.608343601226807, "step": 8085 }, { "epoch": 1.79, "learning_rate": 2.871316735717877e-07, "logits/chosen": -1.892465353012085, "logits/rejected": -1.8541393280029297, "logps/chosen": -95.72125244140625, "logps/rejected": -55.53349685668945, "loss": 0.2538, "rewards/accuracies": 1.0, "rewards/chosen": 3.847808837890625, "rewards/margins": 1.0550227165222168, "rewards/rejected": 2.792786121368408, "step": 8086 }, { "epoch": 1.79, "learning_rate": 2.8653334179329803e-07, "logits/chosen": -2.0798990726470947, "logits/rejected": -1.9824626445770264, "logps/chosen": -97.11922454833984, "logps/rejected": -41.61280059814453, "loss": 0.3634, "rewards/accuracies": 1.0, "rewards/chosen": 8.72035026550293, "rewards/margins": 3.375328540802002, "rewards/rejected": 5.345021724700928, "step": 8087 }, { "epoch": 1.79, "learning_rate": 2.859356156839421e-07, "logits/chosen": -2.20670223236084, "logits/rejected": -2.1392953395843506, "logps/chosen": -81.47483825683594, "logps/rejected": -67.9832763671875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 8.464424133300781, "rewards/margins": 5.556146144866943, "rewards/rejected": 2.908277988433838, "step": 8088 }, { "epoch": 1.79, "learning_rate": 2.8533849532052736e-07, "logits/chosen": -1.9723260402679443, "logits/rejected": -1.9754183292388916, "logps/chosen": -26.710765838623047, "logps/rejected": -85.85319519042969, "loss": 0.6384, "rewards/accuracies": 1.0, "rewards/chosen": 4.321823596954346, "rewards/margins": 1.1943867206573486, "rewards/rejected": 3.127436876296997, "step": 8089 }, { "epoch": 1.79, "learning_rate": 2.84741980779783e-07, "logits/chosen": -1.7808574438095093, "logits/rejected": -1.7886762619018555, "logps/chosen": -80.28863525390625, "logps/rejected": -78.20751190185547, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 4.258243560791016, "rewards/margins": 2.141162157058716, "rewards/rejected": 2.1170814037323, "step": 8090 }, { "epoch": 1.79, "learning_rate": 2.841460721383571e-07, "logits/chosen": -1.8991185426712036, "logits/rejected": -1.930033564567566, "logps/chosen": -51.209774017333984, "logps/rejected": -106.93343353271484, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": 3.6569316387176514, "rewards/margins": 1.804925799369812, "rewards/rejected": 1.8520058393478394, "step": 8091 }, { "epoch": 1.79, "learning_rate": 2.8355076947282624e-07, "logits/chosen": -1.8874906301498413, "logits/rejected": -1.803301453590393, "logps/chosen": -56.7059326171875, "logps/rejected": -24.352872848510742, "loss": 0.1553, "rewards/accuracies": 1.0, "rewards/chosen": 3.2650649547576904, "rewards/margins": 1.1538975238800049, "rewards/rejected": 2.1111674308776855, "step": 8092 }, { "epoch": 1.79, "learning_rate": 2.82956072859682e-07, "logits/chosen": -1.880617380142212, "logits/rejected": -1.9446275234222412, "logps/chosen": -56.073570251464844, "logps/rejected": -122.29869842529297, "loss": 0.9794, "rewards/accuracies": 0.0, "rewards/chosen": 4.330922603607178, "rewards/margins": -1.7863068580627441, "rewards/rejected": 6.117229461669922, "step": 8093 }, { "epoch": 1.79, "learning_rate": 2.8236198237534427e-07, "logits/chosen": -2.208466053009033, "logits/rejected": -2.2153494358062744, "logps/chosen": -93.56183624267578, "logps/rejected": -36.022674560546875, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 7.103798866271973, "rewards/margins": 2.130734920501709, "rewards/rejected": 4.973063945770264, "step": 8094 }, { "epoch": 1.79, "learning_rate": 2.817684980961499e-07, "logits/chosen": -1.92893385887146, "logits/rejected": -1.8306312561035156, "logps/chosen": -166.69300842285156, "logps/rejected": -56.40556335449219, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 5.374513149261475, "rewards/margins": 2.7234914302825928, "rewards/rejected": 2.651021718978882, "step": 8095 }, { "epoch": 1.79, "learning_rate": 2.8117562009836173e-07, "logits/chosen": -2.170879364013672, "logits/rejected": -2.172032594680786, "logps/chosen": -68.67864990234375, "logps/rejected": -59.313079833984375, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 5.994046211242676, "rewards/margins": 1.6467041969299316, "rewards/rejected": 4.347342014312744, "step": 8096 }, { "epoch": 1.79, "learning_rate": 2.8058334845816214e-07, "logits/chosen": -2.0558595657348633, "logits/rejected": -2.0717203617095947, "logps/chosen": -36.81090545654297, "logps/rejected": -42.49284362792969, "loss": 1.058, "rewards/accuracies": 0.0, "rewards/chosen": 4.007427215576172, "rewards/margins": -1.9546747207641602, "rewards/rejected": 5.962101936340332, "step": 8097 }, { "epoch": 1.79, "learning_rate": 2.799916832516575e-07, "logits/chosen": -1.7285715341567993, "logits/rejected": -1.7285715341567993, "logps/chosen": -54.707733154296875, "logps/rejected": -54.707733154296875, "loss": 0.3497, "rewards/accuracies": 0.0, "rewards/chosen": 4.333848476409912, "rewards/margins": 0.0, "rewards/rejected": 4.333848476409912, "step": 8098 }, { "epoch": 1.79, "learning_rate": 2.7940062455487584e-07, "logits/chosen": -1.7162556648254395, "logits/rejected": -1.6067936420440674, "logps/chosen": -19.933305740356445, "logps/rejected": -10.348512649536133, "loss": 0.1883, "rewards/accuracies": 1.0, "rewards/chosen": 2.3010575771331787, "rewards/margins": 1.1304535865783691, "rewards/rejected": 1.1706039905548096, "step": 8099 }, { "epoch": 1.79, "learning_rate": 2.788101724437642e-07, "logits/chosen": -2.408442497253418, "logits/rejected": -2.427035331726074, "logps/chosen": -94.61677551269531, "logps/rejected": -115.7994384765625, "loss": 0.0753, "rewards/accuracies": 1.0, "rewards/chosen": 7.570765972137451, "rewards/margins": 3.6535112857818604, "rewards/rejected": 3.917254686355591, "step": 8100 }, { "epoch": 1.79, "learning_rate": 2.7822032699419743e-07, "logits/chosen": -1.9013090133666992, "logits/rejected": -1.7914339303970337, "logps/chosen": -75.7796630859375, "logps/rejected": -27.966413497924805, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 4.922990322113037, "rewards/margins": 2.8401501178741455, "rewards/rejected": 2.0828402042388916, "step": 8101 }, { "epoch": 1.79, "learning_rate": 2.7763108828196605e-07, "logits/chosen": -1.9701464176177979, "logits/rejected": -1.9118283987045288, "logps/chosen": -84.16392517089844, "logps/rejected": -20.243759155273438, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 8.444465637207031, "rewards/margins": 7.001263618469238, "rewards/rejected": 1.443202257156372, "step": 8102 }, { "epoch": 1.79, "learning_rate": 2.770424563827895e-07, "logits/chosen": -2.002908706665039, "logits/rejected": -2.002908706665039, "logps/chosen": -71.39155578613281, "logps/rejected": -71.39155578613281, "loss": 0.6118, "rewards/accuracies": 0.0, "rewards/chosen": 7.824925422668457, "rewards/margins": 0.0, "rewards/rejected": 7.824925422668457, "step": 8103 }, { "epoch": 1.79, "learning_rate": 2.7645443137230274e-07, "logits/chosen": -2.0297420024871826, "logits/rejected": -1.8796457052230835, "logps/chosen": -78.05328369140625, "logps/rejected": -78.18547821044922, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 8.611337661743164, "rewards/margins": 2.480297565460205, "rewards/rejected": 6.131040096282959, "step": 8104 }, { "epoch": 1.79, "learning_rate": 2.7586701332606656e-07, "logits/chosen": -2.2219724655151367, "logits/rejected": -2.207770586013794, "logps/chosen": -63.668373107910156, "logps/rejected": -92.551025390625, "loss": 0.6517, "rewards/accuracies": 0.0, "rewards/chosen": 8.580327987670898, "rewards/margins": -0.9809741973876953, "rewards/rejected": 9.561302185058594, "step": 8105 }, { "epoch": 1.79, "learning_rate": 2.752802023195622e-07, "logits/chosen": -2.0466573238372803, "logits/rejected": -2.0099563598632812, "logps/chosen": -34.9832763671875, "logps/rejected": -75.93438720703125, "loss": 0.1467, "rewards/accuracies": 1.0, "rewards/chosen": 4.016822814941406, "rewards/margins": 1.2738196849822998, "rewards/rejected": 2.7430031299591064, "step": 8106 }, { "epoch": 1.79, "learning_rate": 2.7469399842819435e-07, "logits/chosen": -1.876004934310913, "logits/rejected": -1.9069703817367554, "logps/chosen": -22.958253860473633, "logps/rejected": -89.74130249023438, "loss": 0.2781, "rewards/accuracies": 1.0, "rewards/chosen": 4.050514221191406, "rewards/margins": 0.6847548484802246, "rewards/rejected": 3.3657593727111816, "step": 8107 }, { "epoch": 1.79, "learning_rate": 2.7410840172728825e-07, "logits/chosen": -2.0033557415008545, "logits/rejected": -2.0004494190216064, "logps/chosen": -41.744232177734375, "logps/rejected": -46.518638610839844, "loss": 0.379, "rewards/accuracies": 0.0, "rewards/chosen": 1.99431312084198, "rewards/margins": -0.03052365779876709, "rewards/rejected": 2.024836778640747, "step": 8108 }, { "epoch": 1.79, "learning_rate": 2.7352341229209213e-07, "logits/chosen": -2.0705037117004395, "logits/rejected": -2.0686817169189453, "logps/chosen": -20.43949317932129, "logps/rejected": -61.00351333618164, "loss": 1.4295, "rewards/accuracies": 0.0, "rewards/chosen": 2.5168561935424805, "rewards/margins": -0.6786527633666992, "rewards/rejected": 3.1955089569091797, "step": 8109 }, { "epoch": 1.8, "learning_rate": 2.7293903019777577e-07, "logits/chosen": -1.7643018960952759, "logits/rejected": -1.8575835227966309, "logps/chosen": -45.031089782714844, "logps/rejected": -147.63833618164062, "loss": 0.7949, "rewards/accuracies": 0.0, "rewards/chosen": 5.751297950744629, "rewards/margins": -1.2727165222167969, "rewards/rejected": 7.024014472961426, "step": 8110 }, { "epoch": 1.8, "learning_rate": 2.723552555194292e-07, "logits/chosen": -1.9511315822601318, "logits/rejected": -1.9594753980636597, "logps/chosen": -48.632911682128906, "logps/rejected": -52.71645736694336, "loss": 0.2731, "rewards/accuracies": 1.0, "rewards/chosen": 3.594594717025757, "rewards/margins": 0.42203259468078613, "rewards/rejected": 3.1725621223449707, "step": 8111 }, { "epoch": 1.8, "learning_rate": 2.717720883320685e-07, "logits/chosen": -2.0642993450164795, "logits/rejected": -1.96265709400177, "logps/chosen": -51.65019226074219, "logps/rejected": -43.87437438964844, "loss": 0.632, "rewards/accuracies": 1.0, "rewards/chosen": 6.010009765625, "rewards/margins": 5.6340460777282715, "rewards/rejected": 0.37596359848976135, "step": 8112 }, { "epoch": 1.8, "learning_rate": 2.711895287106275e-07, "logits/chosen": -1.69039785861969, "logits/rejected": -1.5327811241149902, "logps/chosen": -45.50249481201172, "logps/rejected": -31.745929718017578, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.130263566970825, "rewards/margins": -0.9837672710418701, "rewards/rejected": 4.114030838012695, "step": 8113 }, { "epoch": 1.8, "learning_rate": 2.7060757672996485e-07, "logits/chosen": -2.253769874572754, "logits/rejected": -2.211977481842041, "logps/chosen": -67.01451110839844, "logps/rejected": -77.4683609008789, "loss": 0.1282, "rewards/accuracies": 1.0, "rewards/chosen": 10.761387825012207, "rewards/margins": 1.2448005676269531, "rewards/rejected": 9.516587257385254, "step": 8114 }, { "epoch": 1.8, "learning_rate": 2.700262324648589e-07, "logits/chosen": -1.8388371467590332, "logits/rejected": -1.8080415725708008, "logps/chosen": -42.70701599121094, "logps/rejected": -53.426612854003906, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 4.259778022766113, "rewards/margins": 1.4577462673187256, "rewards/rejected": 2.8020317554473877, "step": 8115 }, { "epoch": 1.8, "learning_rate": 2.6944549599001167e-07, "logits/chosen": -1.904494047164917, "logits/rejected": -1.9014629125595093, "logps/chosen": -46.994651794433594, "logps/rejected": -71.8045654296875, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": 4.408059120178223, "rewards/margins": 2.2581543922424316, "rewards/rejected": 2.149904727935791, "step": 8116 }, { "epoch": 1.8, "learning_rate": 2.688653673800462e-07, "logits/chosen": -2.0628466606140137, "logits/rejected": -2.003371238708496, "logps/chosen": -112.88847351074219, "logps/rejected": -35.62173843383789, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 7.806071758270264, "rewards/margins": 6.512231826782227, "rewards/rejected": 1.293839693069458, "step": 8117 }, { "epoch": 1.8, "learning_rate": 2.682858467095079e-07, "logits/chosen": -1.9672799110412598, "logits/rejected": -1.9294958114624023, "logps/chosen": -74.71499633789062, "logps/rejected": -32.29850769042969, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 6.0962982177734375, "rewards/margins": 1.9887428283691406, "rewards/rejected": 4.107555389404297, "step": 8118 }, { "epoch": 1.8, "learning_rate": 2.6770693405286375e-07, "logits/chosen": -2.0379059314727783, "logits/rejected": -1.7053362131118774, "logps/chosen": -150.3187255859375, "logps/rejected": -37.570701599121094, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 7.929725646972656, "rewards/margins": 5.012126922607422, "rewards/rejected": 2.9175987243652344, "step": 8119 }, { "epoch": 1.8, "learning_rate": 2.6712862948450156e-07, "logits/chosen": -2.054852247238159, "logits/rejected": -2.136096239089966, "logps/chosen": -66.87442016601562, "logps/rejected": -144.0695343017578, "loss": 1.1357, "rewards/accuracies": 0.0, "rewards/chosen": 9.444205284118652, "rewards/margins": -2.152888298034668, "rewards/rejected": 11.59709358215332, "step": 8120 }, { "epoch": 1.8, "learning_rate": 2.6655093307873416e-07, "logits/chosen": -1.6513500213623047, "logits/rejected": -1.6135258674621582, "logps/chosen": -89.04743957519531, "logps/rejected": -81.06402587890625, "loss": 0.0675, "rewards/accuracies": 1.0, "rewards/chosen": 5.844933986663818, "rewards/margins": 2.1625776290893555, "rewards/rejected": 3.682356357574463, "step": 8121 }, { "epoch": 1.8, "learning_rate": 2.6597384490979207e-07, "logits/chosen": -1.9157862663269043, "logits/rejected": -1.9048490524291992, "logps/chosen": -65.2843017578125, "logps/rejected": -93.13294982910156, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 5.979041576385498, "rewards/margins": 3.6581919193267822, "rewards/rejected": 2.320849657058716, "step": 8122 }, { "epoch": 1.8, "learning_rate": 2.65397365051831e-07, "logits/chosen": -1.8734220266342163, "logits/rejected": -1.8734220266342163, "logps/chosen": -55.07563400268555, "logps/rejected": -55.07563400268555, "loss": 0.3467, "rewards/accuracies": 0.0, "rewards/chosen": 5.501498222351074, "rewards/margins": 0.0, "rewards/rejected": 5.501498222351074, "step": 8123 }, { "epoch": 1.8, "learning_rate": 2.648214935789267e-07, "logits/chosen": -1.9250500202178955, "logits/rejected": -1.872991919517517, "logps/chosen": -45.05982208251953, "logps/rejected": -22.067771911621094, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": 4.6409125328063965, "rewards/margins": 2.5965735912323, "rewards/rejected": 2.0443389415740967, "step": 8124 }, { "epoch": 1.8, "learning_rate": 2.642462305650778e-07, "logits/chosen": -1.83963942527771, "logits/rejected": -1.7501193284988403, "logps/chosen": -65.94602966308594, "logps/rejected": -95.2988052368164, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 9.267168045043945, "rewards/margins": 7.397436141967773, "rewards/rejected": 1.8697319030761719, "step": 8125 }, { "epoch": 1.8, "learning_rate": 2.6367157608420347e-07, "logits/chosen": -1.975492000579834, "logits/rejected": -1.9912389516830444, "logps/chosen": -99.86536407470703, "logps/rejected": -118.4106674194336, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 7.1408257484436035, "rewards/margins": 2.3765883445739746, "rewards/rejected": 4.764237403869629, "step": 8126 }, { "epoch": 1.8, "learning_rate": 2.630975302101457e-07, "logits/chosen": -1.899582028388977, "logits/rejected": -1.9008314609527588, "logps/chosen": -30.595346450805664, "logps/rejected": -42.57434844970703, "loss": 0.3007, "rewards/accuracies": 1.0, "rewards/chosen": 3.967494249343872, "rewards/margins": 0.3097357749938965, "rewards/rejected": 3.6577584743499756, "step": 8127 }, { "epoch": 1.8, "learning_rate": 2.6252409301666783e-07, "logits/chosen": -2.45400071144104, "logits/rejected": -2.466299295425415, "logps/chosen": -82.05899047851562, "logps/rejected": -29.39126968383789, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 4.5693254470825195, "rewards/margins": 1.2918312549591064, "rewards/rejected": 3.277494192123413, "step": 8128 }, { "epoch": 1.8, "learning_rate": 2.619512645774558e-07, "logits/chosen": -1.786356806755066, "logits/rejected": -1.7974956035614014, "logps/chosen": -52.278968811035156, "logps/rejected": -63.06390380859375, "loss": 0.3438, "rewards/accuracies": 1.0, "rewards/chosen": 3.0454177856445312, "rewards/margins": 0.07868266105651855, "rewards/rejected": 2.9667351245880127, "step": 8129 }, { "epoch": 1.8, "learning_rate": 2.613790449661174e-07, "logits/chosen": -1.9434208869934082, "logits/rejected": -1.915628433227539, "logps/chosen": -35.518638610839844, "logps/rejected": -52.57585144042969, "loss": 0.3616, "rewards/accuracies": 1.0, "rewards/chosen": 2.6300065517425537, "rewards/margins": 0.8178979158401489, "rewards/rejected": 1.8121086359024048, "step": 8130 }, { "epoch": 1.8, "learning_rate": 2.6080743425617895e-07, "logits/chosen": -2.0331883430480957, "logits/rejected": -1.978910207748413, "logps/chosen": -34.8118896484375, "logps/rejected": -7.179470062255859, "loss": 0.0504, "rewards/accuracies": 1.0, "rewards/chosen": 3.4236221313476562, "rewards/margins": 2.2440333366394043, "rewards/rejected": 1.1795886754989624, "step": 8131 }, { "epoch": 1.8, "learning_rate": 2.602364325210932e-07, "logits/chosen": -1.9200854301452637, "logits/rejected": -1.9200854301452637, "logps/chosen": -51.64569854736328, "logps/rejected": -51.64569854736328, "loss": 0.3522, "rewards/accuracies": 0.0, "rewards/chosen": 3.71990966796875, "rewards/margins": 0.0, "rewards/rejected": 3.71990966796875, "step": 8132 }, { "epoch": 1.8, "learning_rate": 2.5966603983423155e-07, "logits/chosen": -1.7029439210891724, "logits/rejected": -1.7029439210891724, "logps/chosen": -10.836193084716797, "logps/rejected": -10.836193084716797, "loss": 0.4527, "rewards/accuracies": 0.0, "rewards/chosen": 3.222407102584839, "rewards/margins": 0.0, "rewards/rejected": 3.222407102584839, "step": 8133 }, { "epoch": 1.8, "learning_rate": 2.5909625626888813e-07, "logits/chosen": -2.1665027141571045, "logits/rejected": -2.1783015727996826, "logps/chosen": -37.2536506652832, "logps/rejected": -33.178314208984375, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 2.4554035663604736, "rewards/margins": 0.1262505054473877, "rewards/rejected": 2.329153060913086, "step": 8134 }, { "epoch": 1.8, "learning_rate": 2.585270818982788e-07, "logits/chosen": -2.062966823577881, "logits/rejected": -2.088620901107788, "logps/chosen": -100.82081604003906, "logps/rejected": -67.11515808105469, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 9.59399127960205, "rewards/margins": 2.552804470062256, "rewards/rejected": 7.041186809539795, "step": 8135 }, { "epoch": 1.8, "learning_rate": 2.5795851679554174e-07, "logits/chosen": -2.074702024459839, "logits/rejected": -2.022923707962036, "logps/chosen": -42.812095642089844, "logps/rejected": -18.76268768310547, "loss": 1.0092, "rewards/accuracies": 1.0, "rewards/chosen": 3.175133466720581, "rewards/margins": 2.0510401725769043, "rewards/rejected": 1.1240934133529663, "step": 8136 }, { "epoch": 1.8, "learning_rate": 2.573905610337357e-07, "logits/chosen": -2.2609915733337402, "logits/rejected": -2.22220516204834, "logps/chosen": -51.542694091796875, "logps/rejected": -136.74588012695312, "loss": 0.1467, "rewards/accuracies": 1.0, "rewards/chosen": 10.609214782714844, "rewards/margins": 1.0808696746826172, "rewards/rejected": 9.528345108032227, "step": 8137 }, { "epoch": 1.8, "learning_rate": 2.5682321468584184e-07, "logits/chosen": -2.0783798694610596, "logits/rejected": -2.073751211166382, "logps/chosen": -91.01696014404297, "logps/rejected": -41.108787536621094, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 5.419764041900635, "rewards/margins": 1.7678413391113281, "rewards/rejected": 3.6519227027893066, "step": 8138 }, { "epoch": 1.8, "learning_rate": 2.562564778247617e-07, "logits/chosen": -1.826785683631897, "logits/rejected": -1.8733271360397339, "logps/chosen": -48.92652130126953, "logps/rejected": -54.03443145751953, "loss": 0.5442, "rewards/accuracies": 1.0, "rewards/chosen": 3.925832509994507, "rewards/margins": 0.23897337913513184, "rewards/rejected": 3.686859130859375, "step": 8139 }, { "epoch": 1.8, "learning_rate": 2.556903505233216e-07, "logits/chosen": -2.003858804702759, "logits/rejected": -1.8805190324783325, "logps/chosen": -107.26176452636719, "logps/rejected": -33.87486267089844, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.871495008468628, "rewards/margins": 3.1786530017852783, "rewards/rejected": -0.30715808272361755, "step": 8140 }, { "epoch": 1.8, "learning_rate": 2.5512483285426606e-07, "logits/chosen": -1.998672366142273, "logits/rejected": -1.8836820125579834, "logps/chosen": -117.34893035888672, "logps/rejected": -63.12517547607422, "loss": 0.2483, "rewards/accuracies": 1.0, "rewards/chosen": 6.698348522186279, "rewards/margins": 3.5427005290985107, "rewards/rejected": 3.1556479930877686, "step": 8141 }, { "epoch": 1.8, "learning_rate": 2.5455992489026303e-07, "logits/chosen": -1.6022738218307495, "logits/rejected": -1.6022738218307495, "logps/chosen": -15.526097297668457, "logps/rejected": -15.526097297668457, "loss": 0.457, "rewards/accuracies": 0.0, "rewards/chosen": 3.336919069290161, "rewards/margins": 0.0, "rewards/rejected": 3.336919069290161, "step": 8142 }, { "epoch": 1.8, "learning_rate": 2.539956267039018e-07, "logits/chosen": -2.089362859725952, "logits/rejected": -2.089362859725952, "logps/chosen": -79.45236206054688, "logps/rejected": -79.45236206054688, "loss": 0.3526, "rewards/accuracies": 0.0, "rewards/chosen": 8.251126289367676, "rewards/margins": 0.0, "rewards/rejected": 8.251126289367676, "step": 8143 }, { "epoch": 1.8, "learning_rate": 2.534319383676936e-07, "logits/chosen": -2.0015530586242676, "logits/rejected": -1.9749963283538818, "logps/chosen": -7.2553558349609375, "logps/rejected": -17.802783966064453, "loss": 0.3688, "rewards/accuracies": 1.0, "rewards/chosen": 2.113424062728882, "rewards/margins": 0.6709847450256348, "rewards/rejected": 1.442439317703247, "step": 8144 }, { "epoch": 1.8, "learning_rate": 2.5286885995407076e-07, "logits/chosen": -2.1540794372558594, "logits/rejected": -2.180830478668213, "logps/chosen": -20.979278564453125, "logps/rejected": -79.56317138671875, "loss": 0.7762, "rewards/accuracies": 0.0, "rewards/chosen": 4.329030513763428, "rewards/margins": -0.06452751159667969, "rewards/rejected": 4.393558025360107, "step": 8145 }, { "epoch": 1.8, "learning_rate": 2.523063915353874e-07, "logits/chosen": -2.0834178924560547, "logits/rejected": -2.0235092639923096, "logps/chosen": -202.45623779296875, "logps/rejected": -81.95634460449219, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 9.0193452835083, "rewards/margins": 4.415119171142578, "rewards/rejected": 4.604226112365723, "step": 8146 }, { "epoch": 1.8, "learning_rate": 2.517445331839208e-07, "logits/chosen": -1.9915302991867065, "logits/rejected": -1.998109221458435, "logps/chosen": -50.78809356689453, "logps/rejected": -49.329803466796875, "loss": 0.9409, "rewards/accuracies": 0.0, "rewards/chosen": 5.271442413330078, "rewards/margins": -0.825355052947998, "rewards/rejected": 6.096797466278076, "step": 8147 }, { "epoch": 1.8, "learning_rate": 2.511832849718654e-07, "logits/chosen": -1.9057261943817139, "logits/rejected": -1.9015686511993408, "logps/chosen": -60.15713882446289, "logps/rejected": -64.29618835449219, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 3.5690770149230957, "rewards/margins": 0.06595492362976074, "rewards/rejected": 3.503122091293335, "step": 8148 }, { "epoch": 1.8, "learning_rate": 2.506226469713435e-07, "logits/chosen": -2.3064098358154297, "logits/rejected": -2.287510395050049, "logps/chosen": -110.19143676757812, "logps/rejected": -47.56047821044922, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 6.368844509124756, "rewards/margins": 3.306537389755249, "rewards/rejected": 3.062307119369507, "step": 8149 }, { "epoch": 1.8, "learning_rate": 2.5006261925439367e-07, "logits/chosen": -2.1205825805664062, "logits/rejected": -2.1301050186157227, "logps/chosen": -46.131874084472656, "logps/rejected": -48.34465408325195, "loss": 0.1017, "rewards/accuracies": 1.0, "rewards/chosen": 5.740825653076172, "rewards/margins": 1.9564096927642822, "rewards/rejected": 3.7844159603118896, "step": 8150 }, { "epoch": 1.8, "learning_rate": 2.4950320189297884e-07, "logits/chosen": -1.7817838191986084, "logits/rejected": -1.8305463790893555, "logps/chosen": -52.79075622558594, "logps/rejected": -87.13060760498047, "loss": 0.7873, "rewards/accuracies": 0.0, "rewards/chosen": 4.542782783508301, "rewards/margins": -0.9659337997436523, "rewards/rejected": 5.508716583251953, "step": 8151 }, { "epoch": 1.8, "learning_rate": 2.489443949589826e-07, "logits/chosen": -1.7499638795852661, "logits/rejected": -1.8261440992355347, "logps/chosen": -48.171382904052734, "logps/rejected": -75.26737976074219, "loss": 2.047, "rewards/accuracies": 0.0, "rewards/chosen": 4.923374652862549, "rewards/margins": -3.6281685829162598, "rewards/rejected": 8.551543235778809, "step": 8152 }, { "epoch": 1.8, "learning_rate": 2.483861985242103e-07, "logits/chosen": -1.9571181535720825, "logits/rejected": -1.9289788007736206, "logps/chosen": -29.826976776123047, "logps/rejected": -48.68583679199219, "loss": 0.5317, "rewards/accuracies": 0.0, "rewards/chosen": 3.668013334274292, "rewards/margins": -0.5318248271942139, "rewards/rejected": 4.199838161468506, "step": 8153 }, { "epoch": 1.8, "learning_rate": 2.4782861266038904e-07, "logits/chosen": -1.7709261178970337, "logits/rejected": -1.7130850553512573, "logps/chosen": -72.692138671875, "logps/rejected": -82.74740600585938, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": 6.083964824676514, "rewards/margins": 2.374335765838623, "rewards/rejected": 3.7096290588378906, "step": 8154 }, { "epoch": 1.81, "learning_rate": 2.47271637439167e-07, "logits/chosen": -1.8059415817260742, "logits/rejected": -1.7176332473754883, "logps/chosen": -116.89031982421875, "logps/rejected": -48.8212890625, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 6.719259738922119, "rewards/margins": 2.6081457138061523, "rewards/rejected": 4.111114025115967, "step": 8155 }, { "epoch": 1.81, "learning_rate": 2.4671527293211537e-07, "logits/chosen": -1.7294245958328247, "logits/rejected": -1.795041561126709, "logps/chosen": -41.04224395751953, "logps/rejected": -104.06666564941406, "loss": 0.551, "rewards/accuracies": 0.0, "rewards/chosen": 3.5621583461761475, "rewards/margins": -0.6100828647613525, "rewards/rejected": 4.1722412109375, "step": 8156 }, { "epoch": 1.81, "learning_rate": 2.4615951921072303e-07, "logits/chosen": -1.8409523963928223, "logits/rejected": -1.8409523963928223, "logps/chosen": -30.266830444335938, "logps/rejected": -30.266830444335938, "loss": 0.7804, "rewards/accuracies": 0.0, "rewards/chosen": 2.6243321895599365, "rewards/margins": 0.0, "rewards/rejected": 2.6243321895599365, "step": 8157 }, { "epoch": 1.81, "learning_rate": 2.456043763464061e-07, "logits/chosen": -1.8146066665649414, "logits/rejected": -1.7927770614624023, "logps/chosen": -15.441669464111328, "logps/rejected": -32.705360412597656, "loss": 0.7334, "rewards/accuracies": 0.0, "rewards/chosen": 2.0263211727142334, "rewards/margins": -1.2006144523620605, "rewards/rejected": 3.226935625076294, "step": 8158 }, { "epoch": 1.81, "learning_rate": 2.450498444104954e-07, "logits/chosen": -2.033613443374634, "logits/rejected": -1.9845280647277832, "logps/chosen": -103.19528198242188, "logps/rejected": -91.41036987304688, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 8.764021873474121, "rewards/margins": 3.0525898933410645, "rewards/rejected": 5.711431980133057, "step": 8159 }, { "epoch": 1.81, "learning_rate": 2.4449592347425113e-07, "logits/chosen": -1.9264498949050903, "logits/rejected": -1.8022129535675049, "logps/chosen": -105.45881652832031, "logps/rejected": -40.06936264038086, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 8.248954772949219, "rewards/margins": 5.546378135681152, "rewards/rejected": 2.7025768756866455, "step": 8160 }, { "epoch": 1.81, "learning_rate": 2.4394261360884797e-07, "logits/chosen": -1.707240343093872, "logits/rejected": -1.3204036951065063, "logps/chosen": -87.93214416503906, "logps/rejected": -86.14273071289062, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 8.287602424621582, "rewards/margins": 5.82353401184082, "rewards/rejected": 2.464068651199341, "step": 8161 }, { "epoch": 1.81, "learning_rate": 2.4338991488538577e-07, "logits/chosen": -2.1844799518585205, "logits/rejected": -2.2027370929718018, "logps/chosen": -72.73743438720703, "logps/rejected": -85.53436279296875, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 8.773909568786621, "rewards/margins": 2.5236258506774902, "rewards/rejected": 6.250283718109131, "step": 8162 }, { "epoch": 1.81, "learning_rate": 2.4283782737488495e-07, "logits/chosen": -2.049621343612671, "logits/rejected": -1.9989914894104004, "logps/chosen": -52.34764862060547, "logps/rejected": -74.30123901367188, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 9.887897491455078, "rewards/margins": 1.4756736755371094, "rewards/rejected": 8.412223815917969, "step": 8163 }, { "epoch": 1.81, "learning_rate": 2.422863511482876e-07, "logits/chosen": -2.0853805541992188, "logits/rejected": -2.1073150634765625, "logps/chosen": -37.176143646240234, "logps/rejected": -64.93722534179688, "loss": 1.7926, "rewards/accuracies": 0.0, "rewards/chosen": 2.9279630184173584, "rewards/margins": -2.8349597454071045, "rewards/rejected": 5.762922763824463, "step": 8164 }, { "epoch": 1.81, "learning_rate": 2.41735486276457e-07, "logits/chosen": -1.9613384008407593, "logits/rejected": -1.915443778038025, "logps/chosen": -82.18285369873047, "logps/rejected": -63.23966598510742, "loss": 0.2874, "rewards/accuracies": 1.0, "rewards/chosen": 4.763404369354248, "rewards/margins": 0.26499462127685547, "rewards/rejected": 4.498409748077393, "step": 8165 }, { "epoch": 1.81, "learning_rate": 2.411852328301773e-07, "logits/chosen": -1.8691976070404053, "logits/rejected": -1.861043930053711, "logps/chosen": -48.87853240966797, "logps/rejected": -71.02629089355469, "loss": 0.2163, "rewards/accuracies": 1.0, "rewards/chosen": 4.935549259185791, "rewards/margins": 0.6612052917480469, "rewards/rejected": 4.274343967437744, "step": 8166 }, { "epoch": 1.81, "learning_rate": 2.4063559088015607e-07, "logits/chosen": -1.9906679391860962, "logits/rejected": -1.9973185062408447, "logps/chosen": -61.48126220703125, "logps/rejected": -105.14459228515625, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 8.945652961730957, "rewards/margins": 2.2771286964416504, "rewards/rejected": 6.668524265289307, "step": 8167 }, { "epoch": 1.81, "learning_rate": 2.4008656049701875e-07, "logits/chosen": -1.5017871856689453, "logits/rejected": -1.4580765962600708, "logps/chosen": -70.46992492675781, "logps/rejected": -70.1856689453125, "loss": 0.1709, "rewards/accuracies": 1.0, "rewards/chosen": 4.0838518142700195, "rewards/margins": 1.8019945621490479, "rewards/rejected": 2.2818572521209717, "step": 8168 }, { "epoch": 1.81, "learning_rate": 2.395381417513176e-07, "logits/chosen": -1.9152437448501587, "logits/rejected": -1.8433150053024292, "logps/chosen": -45.71138381958008, "logps/rejected": -29.906761169433594, "loss": 0.1302, "rewards/accuracies": 1.0, "rewards/chosen": 4.329658031463623, "rewards/margins": 2.6082820892333984, "rewards/rejected": 1.7213760614395142, "step": 8169 }, { "epoch": 1.81, "learning_rate": 2.3899033471352027e-07, "logits/chosen": -1.7241880893707275, "logits/rejected": -1.7241880893707275, "logps/chosen": -7.13237190246582, "logps/rejected": -7.13237190246582, "loss": 0.3591, "rewards/accuracies": 0.0, "rewards/chosen": 3.1428422927856445, "rewards/margins": 0.0, "rewards/rejected": 3.1428422927856445, "step": 8170 }, { "epoch": 1.81, "learning_rate": 2.384431394540204e-07, "logits/chosen": -1.9032886028289795, "logits/rejected": -1.892810344696045, "logps/chosen": -87.3013687133789, "logps/rejected": -97.62210083007812, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 7.041591644287109, "rewards/margins": 2.3451333045959473, "rewards/rejected": 4.696458339691162, "step": 8171 }, { "epoch": 1.81, "learning_rate": 2.378965560431301e-07, "logits/chosen": -1.7149945497512817, "logits/rejected": -1.6896835565567017, "logps/chosen": -34.88855743408203, "logps/rejected": -64.42384338378906, "loss": 0.163, "rewards/accuracies": 1.0, "rewards/chosen": 5.568965911865234, "rewards/margins": 0.970146656036377, "rewards/rejected": 4.598819255828857, "step": 8172 }, { "epoch": 1.81, "learning_rate": 2.3735058455108484e-07, "logits/chosen": -2.0011825561523438, "logits/rejected": -1.9316552877426147, "logps/chosen": -87.88935852050781, "logps/rejected": -68.5050048828125, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 6.97438383102417, "rewards/margins": 3.427704095840454, "rewards/rejected": 3.546679735183716, "step": 8173 }, { "epoch": 1.81, "learning_rate": 2.3680522504804128e-07, "logits/chosen": -1.9923996925354004, "logits/rejected": -1.929526448249817, "logps/chosen": -84.29676818847656, "logps/rejected": -65.58203125, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 6.012528896331787, "rewards/margins": 2.6237425804138184, "rewards/rejected": 3.3887863159179688, "step": 8174 }, { "epoch": 1.81, "learning_rate": 2.3626047760407445e-07, "logits/chosen": -2.1622626781463623, "logits/rejected": -2.0748748779296875, "logps/chosen": -36.35002136230469, "logps/rejected": -20.28750991821289, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 3.1690003871917725, "rewards/margins": 2.2785825729370117, "rewards/rejected": 0.8904176950454712, "step": 8175 }, { "epoch": 1.81, "learning_rate": 2.357163422891856e-07, "logits/chosen": -2.0611612796783447, "logits/rejected": -2.0854928493499756, "logps/chosen": -26.897798538208008, "logps/rejected": -62.59127426147461, "loss": 0.2293, "rewards/accuracies": 1.0, "rewards/chosen": 3.1758742332458496, "rewards/margins": 0.6887781620025635, "rewards/rejected": 2.487096071243286, "step": 8176 }, { "epoch": 1.81, "learning_rate": 2.351728191732927e-07, "logits/chosen": -1.8061820268630981, "logits/rejected": -1.8646271228790283, "logps/chosen": -41.686866760253906, "logps/rejected": -80.40188598632812, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 5.250673770904541, "rewards/margins": 0.0634775161743164, "rewards/rejected": 5.187196254730225, "step": 8177 }, { "epoch": 1.81, "learning_rate": 2.3462990832623933e-07, "logits/chosen": -1.778483510017395, "logits/rejected": -1.731702446937561, "logps/chosen": -74.71955108642578, "logps/rejected": -43.4414176940918, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 5.783077239990234, "rewards/margins": 2.623657464981079, "rewards/rejected": 3.1594197750091553, "step": 8178 }, { "epoch": 1.81, "learning_rate": 2.3408760981778634e-07, "logits/chosen": -2.253726005554199, "logits/rejected": -2.164668560028076, "logps/chosen": -48.76355743408203, "logps/rejected": -38.781280517578125, "loss": 0.1936, "rewards/accuracies": 1.0, "rewards/chosen": 5.113064765930176, "rewards/margins": 0.8758630752563477, "rewards/rejected": 4.237201690673828, "step": 8179 }, { "epoch": 1.81, "learning_rate": 2.3354592371761852e-07, "logits/chosen": -1.9645017385482788, "logits/rejected": -1.945628046989441, "logps/chosen": -23.955604553222656, "logps/rejected": -63.50660705566406, "loss": 0.3379, "rewards/accuracies": 1.0, "rewards/chosen": 3.9380481243133545, "rewards/margins": 0.6935319900512695, "rewards/rejected": 3.244516134262085, "step": 8180 }, { "epoch": 1.81, "learning_rate": 2.330048500953419e-07, "logits/chosen": -2.0844712257385254, "logits/rejected": -2.15120792388916, "logps/chosen": -67.04139709472656, "logps/rejected": -67.06269073486328, "loss": 0.1293, "rewards/accuracies": 1.0, "rewards/chosen": 10.543062210083008, "rewards/margins": 1.504664421081543, "rewards/rejected": 9.038397789001465, "step": 8181 }, { "epoch": 1.81, "learning_rate": 2.3246438902048196e-07, "logits/chosen": -2.155466079711914, "logits/rejected": -2.115485668182373, "logps/chosen": -152.9460906982422, "logps/rejected": -73.62269592285156, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": 8.727785110473633, "rewards/margins": 1.7311038970947266, "rewards/rejected": 6.996681213378906, "step": 8182 }, { "epoch": 1.81, "learning_rate": 2.319245405624876e-07, "logits/chosen": -2.0903351306915283, "logits/rejected": -2.017263174057007, "logps/chosen": -120.10321044921875, "logps/rejected": -80.33795166015625, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 7.688360691070557, "rewards/margins": 2.2465667724609375, "rewards/rejected": 5.441793918609619, "step": 8183 }, { "epoch": 1.81, "learning_rate": 2.3138530479072664e-07, "logits/chosen": -1.8189328908920288, "logits/rejected": -1.8189328908920288, "logps/chosen": -46.04646682739258, "logps/rejected": -46.04646682739258, "loss": 0.3481, "rewards/accuracies": 0.0, "rewards/chosen": 3.981363296508789, "rewards/margins": 0.0, "rewards/rejected": 3.981363296508789, "step": 8184 }, { "epoch": 1.81, "learning_rate": 2.30846681774492e-07, "logits/chosen": -2.0761332511901855, "logits/rejected": -2.0705137252807617, "logps/chosen": -34.40666198730469, "logps/rejected": -26.25406265258789, "loss": 0.4292, "rewards/accuracies": 0.0, "rewards/chosen": 2.644796848297119, "rewards/margins": -0.08894872665405273, "rewards/rejected": 2.733745574951172, "step": 8185 }, { "epoch": 1.81, "learning_rate": 2.3030867158299274e-07, "logits/chosen": -1.8942922353744507, "logits/rejected": -1.8942922353744507, "logps/chosen": -55.287147521972656, "logps/rejected": -55.287147521972656, "loss": 0.3521, "rewards/accuracies": 0.0, "rewards/chosen": 4.000411510467529, "rewards/margins": 0.0, "rewards/rejected": 4.000411510467529, "step": 8186 }, { "epoch": 1.81, "learning_rate": 2.2977127428536473e-07, "logits/chosen": -1.5558152198791504, "logits/rejected": -1.4476326704025269, "logps/chosen": -74.91551208496094, "logps/rejected": -39.969478607177734, "loss": 0.2006, "rewards/accuracies": 1.0, "rewards/chosen": 5.911374092102051, "rewards/margins": 2.2692930698394775, "rewards/rejected": 3.6420810222625732, "step": 8187 }, { "epoch": 1.81, "learning_rate": 2.2923448995065933e-07, "logits/chosen": -2.0984785556793213, "logits/rejected": -2.1360840797424316, "logps/chosen": -45.045345306396484, "logps/rejected": -98.01416015625, "loss": 0.4305, "rewards/accuracies": 0.0, "rewards/chosen": 7.734457492828369, "rewards/margins": -0.2615327835083008, "rewards/rejected": 7.99599027633667, "step": 8188 }, { "epoch": 1.81, "learning_rate": 2.2869831864785364e-07, "logits/chosen": -2.1116769313812256, "logits/rejected": -2.1055757999420166, "logps/chosen": -48.45201110839844, "logps/rejected": -67.65606689453125, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": 4.316640377044678, "rewards/margins": 0.7465696334838867, "rewards/rejected": 3.570070743560791, "step": 8189 }, { "epoch": 1.81, "learning_rate": 2.281627604458442e-07, "logits/chosen": -1.8848830461502075, "logits/rejected": -1.8445466756820679, "logps/chosen": -40.05491638183594, "logps/rejected": -28.55658721923828, "loss": 0.2572, "rewards/accuracies": 1.0, "rewards/chosen": 4.754615306854248, "rewards/margins": 0.5684976577758789, "rewards/rejected": 4.186117649078369, "step": 8190 }, { "epoch": 1.81, "learning_rate": 2.276278154134487e-07, "logits/chosen": -1.815375804901123, "logits/rejected": -1.7887643575668335, "logps/chosen": -31.6834659576416, "logps/rejected": -31.662643432617188, "loss": 0.2509, "rewards/accuracies": 1.0, "rewards/chosen": 2.668592691421509, "rewards/margins": 0.7100332975387573, "rewards/rejected": 1.9585593938827515, "step": 8191 }, { "epoch": 1.81, "learning_rate": 2.270934836194072e-07, "logits/chosen": -1.7903668880462646, "logits/rejected": -1.7546870708465576, "logps/chosen": -35.570648193359375, "logps/rejected": -55.9451904296875, "loss": 0.2718, "rewards/accuracies": 1.0, "rewards/chosen": 3.919996738433838, "rewards/margins": 0.9228591918945312, "rewards/rejected": 2.9971375465393066, "step": 8192 }, { "epoch": 1.81, "learning_rate": 2.265597651323781e-07, "logits/chosen": -2.1446497440338135, "logits/rejected": -2.1446497440338135, "logps/chosen": -54.49327087402344, "logps/rejected": -54.49327087402344, "loss": 0.3955, "rewards/accuracies": 0.0, "rewards/chosen": 8.270750999450684, "rewards/margins": 0.0, "rewards/rejected": 8.270750999450684, "step": 8193 }, { "epoch": 1.81, "learning_rate": 2.2602666002094543e-07, "logits/chosen": -1.9331327676773071, "logits/rejected": -1.8699079751968384, "logps/chosen": -84.89982604980469, "logps/rejected": -52.340877532958984, "loss": 0.2735, "rewards/accuracies": 1.0, "rewards/chosen": 4.387075901031494, "rewards/margins": 1.7684781551361084, "rewards/rejected": 2.6185977458953857, "step": 8194 }, { "epoch": 1.81, "learning_rate": 2.2549416835360938e-07, "logits/chosen": -2.2741475105285645, "logits/rejected": -2.2866759300231934, "logps/chosen": -45.72597885131836, "logps/rejected": -27.479270935058594, "loss": 0.3339, "rewards/accuracies": 1.0, "rewards/chosen": 2.7028729915618896, "rewards/margins": 1.4992787837982178, "rewards/rejected": 1.2035942077636719, "step": 8195 }, { "epoch": 1.81, "learning_rate": 2.2496229019879635e-07, "logits/chosen": -2.230375289916992, "logits/rejected": -2.1775283813476562, "logps/chosen": -35.20685577392578, "logps/rejected": -47.1871452331543, "loss": 0.5789, "rewards/accuracies": 0.0, "rewards/chosen": 3.558650255203247, "rewards/margins": -0.7254369258880615, "rewards/rejected": 4.284087181091309, "step": 8196 }, { "epoch": 1.81, "learning_rate": 2.2443102562484942e-07, "logits/chosen": -1.7947213649749756, "logits/rejected": -1.7676459550857544, "logps/chosen": -60.18606185913086, "logps/rejected": -82.38140869140625, "loss": 0.1674, "rewards/accuracies": 1.0, "rewards/chosen": 3.1094868183135986, "rewards/margins": 1.542662501335144, "rewards/rejected": 1.5668243169784546, "step": 8197 }, { "epoch": 1.81, "learning_rate": 2.2390037470003567e-07, "logits/chosen": -1.8450641632080078, "logits/rejected": -1.7443554401397705, "logps/chosen": -50.600982666015625, "logps/rejected": -21.809133529663086, "loss": 0.4677, "rewards/accuracies": 0.0, "rewards/chosen": 1.0042724609375, "rewards/margins": -0.4344831705093384, "rewards/rejected": 1.4387556314468384, "step": 8198 }, { "epoch": 1.81, "learning_rate": 2.2337033749254222e-07, "logits/chosen": -1.734446406364441, "logits/rejected": -1.7226171493530273, "logps/chosen": -29.676387786865234, "logps/rejected": -67.88485717773438, "loss": 0.6054, "rewards/accuracies": 0.0, "rewards/chosen": 2.8856799602508545, "rewards/margins": -0.831568717956543, "rewards/rejected": 3.7172486782073975, "step": 8199 }, { "epoch": 1.81, "learning_rate": 2.2284091407047791e-07, "logits/chosen": -1.7659558057785034, "logits/rejected": -1.7659558057785034, "logps/chosen": -44.06727600097656, "logps/rejected": -44.06727600097656, "loss": 0.6212, "rewards/accuracies": 0.0, "rewards/chosen": 3.1737053394317627, "rewards/margins": 0.0, "rewards/rejected": 3.1737053394317627, "step": 8200 }, { "epoch": 1.82, "learning_rate": 2.2231210450187223e-07, "logits/chosen": -2.225818634033203, "logits/rejected": -2.2042043209075928, "logps/chosen": -50.79351806640625, "logps/rejected": -68.8846206665039, "loss": 0.2856, "rewards/accuracies": 1.0, "rewards/chosen": 3.9722962379455566, "rewards/margins": 0.7444343566894531, "rewards/rejected": 3.2278618812561035, "step": 8201 }, { "epoch": 1.82, "learning_rate": 2.2178390885467526e-07, "logits/chosen": -2.1464529037475586, "logits/rejected": -2.1536812782287598, "logps/chosen": -51.48481750488281, "logps/rejected": -60.02127456665039, "loss": 0.2319, "rewards/accuracies": 1.0, "rewards/chosen": 4.182498455047607, "rewards/margins": 0.7143347263336182, "rewards/rejected": 3.4681637287139893, "step": 8202 }, { "epoch": 1.82, "learning_rate": 2.212563271967605e-07, "logits/chosen": -2.062561273574829, "logits/rejected": -2.113452672958374, "logps/chosen": -39.87567138671875, "logps/rejected": -76.18327331542969, "loss": 1.6384, "rewards/accuracies": 0.0, "rewards/chosen": 5.1618242263793945, "rewards/margins": -1.3135604858398438, "rewards/rejected": 6.475384712219238, "step": 8203 }, { "epoch": 1.82, "learning_rate": 2.2072935959591867e-07, "logits/chosen": -2.1022818088531494, "logits/rejected": -2.170090913772583, "logps/chosen": -55.83946990966797, "logps/rejected": -96.50665283203125, "loss": 3.24, "rewards/accuracies": 0.0, "rewards/chosen": 3.5153603553771973, "rewards/margins": -4.45328426361084, "rewards/rejected": 7.968644618988037, "step": 8204 }, { "epoch": 1.82, "learning_rate": 2.2020300611986622e-07, "logits/chosen": -1.8729290962219238, "logits/rejected": -1.85704505443573, "logps/chosen": -55.47172546386719, "logps/rejected": -51.237388610839844, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 3.9007461071014404, "rewards/margins": 1.8669188022613525, "rewards/rejected": 2.033827304840088, "step": 8205 }, { "epoch": 1.82, "learning_rate": 2.196772668362368e-07, "logits/chosen": -1.8360061645507812, "logits/rejected": -1.7881509065628052, "logps/chosen": -39.576148986816406, "logps/rejected": -36.378597259521484, "loss": 0.2183, "rewards/accuracies": 1.0, "rewards/chosen": 2.706096649169922, "rewards/margins": 0.8740650415420532, "rewards/rejected": 1.8320316076278687, "step": 8206 }, { "epoch": 1.82, "learning_rate": 2.1915214181258693e-07, "logits/chosen": -1.7951359748840332, "logits/rejected": -1.7639825344085693, "logps/chosen": -157.76101684570312, "logps/rejected": -80.28484344482422, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 9.311144828796387, "rewards/margins": 3.7484912872314453, "rewards/rejected": 5.562653541564941, "step": 8207 }, { "epoch": 1.82, "learning_rate": 2.1862763111639374e-07, "logits/chosen": -1.7582807540893555, "logits/rejected": -1.6566818952560425, "logps/chosen": -30.701702117919922, "logps/rejected": -15.947480201721191, "loss": 0.18, "rewards/accuracies": 1.0, "rewards/chosen": 1.994198203086853, "rewards/margins": 0.8941961526870728, "rewards/rejected": 1.1000020503997803, "step": 8208 }, { "epoch": 1.82, "learning_rate": 2.181037348150561e-07, "logits/chosen": -1.733698844909668, "logits/rejected": -1.8743318319320679, "logps/chosen": -17.67946434020996, "logps/rejected": -44.586395263671875, "loss": 1.1308, "rewards/accuracies": 0.0, "rewards/chosen": 2.0918543338775635, "rewards/margins": -1.9575893878936768, "rewards/rejected": 4.04944372177124, "step": 8209 }, { "epoch": 1.82, "learning_rate": 2.175804529758929e-07, "logits/chosen": -1.9921987056732178, "logits/rejected": -1.9921987056732178, "logps/chosen": -19.366655349731445, "logps/rejected": -19.366655349731445, "loss": 0.4444, "rewards/accuracies": 0.0, "rewards/chosen": 3.7552719116210938, "rewards/margins": 0.0, "rewards/rejected": 3.7552719116210938, "step": 8210 }, { "epoch": 1.82, "learning_rate": 2.1705778566614543e-07, "logits/chosen": -2.0328917503356934, "logits/rejected": -2.050257921218872, "logps/chosen": -43.575904846191406, "logps/rejected": -69.01622009277344, "loss": 1.8397, "rewards/accuracies": 0.0, "rewards/chosen": 4.090301036834717, "rewards/margins": -2.629265308380127, "rewards/rejected": 6.719566345214844, "step": 8211 }, { "epoch": 1.82, "learning_rate": 2.165357329529749e-07, "logits/chosen": -1.7507840394973755, "logits/rejected": -1.3801277875900269, "logps/chosen": -36.644081115722656, "logps/rejected": -68.17353057861328, "loss": 0.1913, "rewards/accuracies": 1.0, "rewards/chosen": 3.5585649013519287, "rewards/margins": 0.8087882995605469, "rewards/rejected": 2.749776601791382, "step": 8212 }, { "epoch": 1.82, "learning_rate": 2.160142949034627e-07, "logits/chosen": -1.6921679973602295, "logits/rejected": -1.6921679973602295, "logps/chosen": -36.821434020996094, "logps/rejected": -36.821434020996094, "loss": 0.3496, "rewards/accuracies": 0.0, "rewards/chosen": 6.7027506828308105, "rewards/margins": 0.0, "rewards/rejected": 6.7027506828308105, "step": 8213 }, { "epoch": 1.82, "learning_rate": 2.1549347158461353e-07, "logits/chosen": -1.9229949712753296, "logits/rejected": -1.9776628017425537, "logps/chosen": -20.664188385009766, "logps/rejected": -67.03766632080078, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 4.82033109664917, "rewards/margins": 1.5744333267211914, "rewards/rejected": 3.2458977699279785, "step": 8214 }, { "epoch": 1.82, "learning_rate": 2.1497326306335164e-07, "logits/chosen": -1.7890591621398926, "logits/rejected": -1.6566412448883057, "logps/chosen": -118.59135437011719, "logps/rejected": -37.41973114013672, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 7.373979091644287, "rewards/margins": 5.075986385345459, "rewards/rejected": 2.297992706298828, "step": 8215 }, { "epoch": 1.82, "learning_rate": 2.1445366940652246e-07, "logits/chosen": -2.0660550594329834, "logits/rejected": -2.0742783546447754, "logps/chosen": -92.60533905029297, "logps/rejected": -135.44305419921875, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 13.154650688171387, "rewards/margins": 3.4713315963745117, "rewards/rejected": 9.683319091796875, "step": 8216 }, { "epoch": 1.82, "learning_rate": 2.1393469068089257e-07, "logits/chosen": -1.842940092086792, "logits/rejected": -1.7456350326538086, "logps/chosen": -47.03181076049805, "logps/rejected": -38.00785827636719, "loss": 0.2173, "rewards/accuracies": 1.0, "rewards/chosen": 4.4239277839660645, "rewards/margins": 0.8429012298583984, "rewards/rejected": 3.581026554107666, "step": 8217 }, { "epoch": 1.82, "learning_rate": 2.1341632695314974e-07, "logits/chosen": -1.9956870079040527, "logits/rejected": -2.0160791873931885, "logps/chosen": -56.49050521850586, "logps/rejected": -66.69203186035156, "loss": 0.3596, "rewards/accuracies": 0.0, "rewards/chosen": 5.946921348571777, "rewards/margins": -0.049845218658447266, "rewards/rejected": 5.996766567230225, "step": 8218 }, { "epoch": 1.82, "learning_rate": 2.1289857828990236e-07, "logits/chosen": -1.8191866874694824, "logits/rejected": -1.757023811340332, "logps/chosen": -50.4621696472168, "logps/rejected": -57.90574264526367, "loss": 0.1583, "rewards/accuracies": 1.0, "rewards/chosen": 3.633331775665283, "rewards/margins": 1.6862579584121704, "rewards/rejected": 1.9470738172531128, "step": 8219 }, { "epoch": 1.82, "learning_rate": 2.123814447576794e-07, "logits/chosen": -2.0451862812042236, "logits/rejected": -1.9662621021270752, "logps/chosen": -116.69499206542969, "logps/rejected": -59.34151077270508, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 6.184092998504639, "rewards/margins": 3.3047995567321777, "rewards/rejected": 2.879293441772461, "step": 8220 }, { "epoch": 1.82, "learning_rate": 2.1186492642293223e-07, "logits/chosen": -1.9371546506881714, "logits/rejected": -1.9371546506881714, "logps/chosen": -53.11098861694336, "logps/rejected": -53.11098861694336, "loss": 0.7224, "rewards/accuracies": 0.0, "rewards/chosen": 3.951726198196411, "rewards/margins": 0.0, "rewards/rejected": 3.951726198196411, "step": 8221 }, { "epoch": 1.82, "learning_rate": 2.1134902335203155e-07, "logits/chosen": -1.7297909259796143, "logits/rejected": -1.6593525409698486, "logps/chosen": -59.198646545410156, "logps/rejected": -7.888344764709473, "loss": 0.2676, "rewards/accuracies": 1.0, "rewards/chosen": 3.023059129714966, "rewards/margins": 1.6436402797698975, "rewards/rejected": 1.3794188499450684, "step": 8222 }, { "epoch": 1.82, "learning_rate": 2.108337356112694e-07, "logits/chosen": -2.068235397338867, "logits/rejected": -1.9369958639144897, "logps/chosen": -131.65560913085938, "logps/rejected": -23.24080467224121, "loss": 0.1358, "rewards/accuracies": 1.0, "rewards/chosen": 4.416062831878662, "rewards/margins": 1.1747252941131592, "rewards/rejected": 3.241337537765503, "step": 8223 }, { "epoch": 1.82, "learning_rate": 2.1031906326685946e-07, "logits/chosen": -2.0217387676239014, "logits/rejected": -2.0007505416870117, "logps/chosen": -45.29678726196289, "logps/rejected": -40.973663330078125, "loss": 0.5043, "rewards/accuracies": 1.0, "rewards/chosen": 2.838937759399414, "rewards/margins": 0.6179561614990234, "rewards/rejected": 2.2209815979003906, "step": 8224 }, { "epoch": 1.82, "learning_rate": 2.0980500638493607e-07, "logits/chosen": -1.8837252855300903, "logits/rejected": -1.8254073858261108, "logps/chosen": -29.166603088378906, "logps/rejected": -8.419286727905273, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 3.0049164295196533, "rewards/margins": 2.1087841987609863, "rewards/rejected": 0.8961321115493774, "step": 8225 }, { "epoch": 1.82, "learning_rate": 2.0929156503155357e-07, "logits/chosen": -2.1972389221191406, "logits/rejected": -2.206883430480957, "logps/chosen": -98.8687744140625, "logps/rejected": -79.53018188476562, "loss": 0.4617, "rewards/accuracies": 0.0, "rewards/chosen": 6.560464382171631, "rewards/margins": -0.36721372604370117, "rewards/rejected": 6.927678108215332, "step": 8226 }, { "epoch": 1.82, "learning_rate": 2.0877873927268865e-07, "logits/chosen": -2.0547139644622803, "logits/rejected": -2.0617897510528564, "logps/chosen": -68.26356506347656, "logps/rejected": -63.83758544921875, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": 4.361236095428467, "rewards/margins": 1.852637767791748, "rewards/rejected": 2.5085983276367188, "step": 8227 }, { "epoch": 1.82, "learning_rate": 2.0826652917423807e-07, "logits/chosen": -1.8861478567123413, "logits/rejected": -1.8444533348083496, "logps/chosen": -121.93113708496094, "logps/rejected": -68.47198486328125, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 7.674421787261963, "rewards/margins": 4.395791053771973, "rewards/rejected": 3.2786309719085693, "step": 8228 }, { "epoch": 1.82, "learning_rate": 2.0775493480201968e-07, "logits/chosen": -2.2782580852508545, "logits/rejected": -2.284078598022461, "logps/chosen": -27.306686401367188, "logps/rejected": -42.33568572998047, "loss": 0.4085, "rewards/accuracies": 1.0, "rewards/chosen": 3.716695547103882, "rewards/margins": 0.2778778076171875, "rewards/rejected": 3.4388177394866943, "step": 8229 }, { "epoch": 1.82, "learning_rate": 2.0724395622177151e-07, "logits/chosen": -2.113799810409546, "logits/rejected": -1.9411510229110718, "logps/chosen": -133.28607177734375, "logps/rejected": -20.127111434936523, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 8.037315368652344, "rewards/margins": 6.8137359619140625, "rewards/rejected": 1.2235796451568604, "step": 8230 }, { "epoch": 1.82, "learning_rate": 2.0673359349915433e-07, "logits/chosen": -2.3121705055236816, "logits/rejected": -2.2574222087860107, "logps/chosen": -136.55484008789062, "logps/rejected": -62.5596923828125, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": 4.754570007324219, "rewards/margins": 1.9202361106872559, "rewards/rejected": 2.834333896636963, "step": 8231 }, { "epoch": 1.82, "learning_rate": 2.0622384669974681e-07, "logits/chosen": -1.8714922666549683, "logits/rejected": -1.894197702407837, "logps/chosen": -54.00713348388672, "logps/rejected": -66.29302978515625, "loss": 0.2107, "rewards/accuracies": 1.0, "rewards/chosen": 4.485715389251709, "rewards/margins": 1.324275016784668, "rewards/rejected": 3.161440372467041, "step": 8232 }, { "epoch": 1.82, "learning_rate": 2.05714715889051e-07, "logits/chosen": -1.9355907440185547, "logits/rejected": -1.9355907440185547, "logps/chosen": -26.939552307128906, "logps/rejected": -26.939552307128906, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 1.7702003717422485, "rewards/margins": 0.0, "rewards/rejected": 1.7702003717422485, "step": 8233 }, { "epoch": 1.82, "learning_rate": 2.0520620113248956e-07, "logits/chosen": -1.7765071392059326, "logits/rejected": -1.7765071392059326, "logps/chosen": -47.197731018066406, "logps/rejected": -47.197731018066406, "loss": 0.3983, "rewards/accuracies": 0.0, "rewards/chosen": 4.063680171966553, "rewards/margins": 0.0, "rewards/rejected": 4.063680171966553, "step": 8234 }, { "epoch": 1.82, "learning_rate": 2.0469830249540467e-07, "logits/chosen": -1.9569826126098633, "logits/rejected": -2.011883497238159, "logps/chosen": -39.29861068725586, "logps/rejected": -170.1207733154297, "loss": 0.5054, "rewards/accuracies": 0.0, "rewards/chosen": 5.344142436981201, "rewards/margins": -0.5579938888549805, "rewards/rejected": 5.902136325836182, "step": 8235 }, { "epoch": 1.82, "learning_rate": 2.0419102004306024e-07, "logits/chosen": -2.053237199783325, "logits/rejected": -2.019493818283081, "logps/chosen": -151.81814575195312, "logps/rejected": -92.69210052490234, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 10.71048641204834, "rewards/margins": 4.378291130065918, "rewards/rejected": 6.332195281982422, "step": 8236 }, { "epoch": 1.82, "learning_rate": 2.0368435384064078e-07, "logits/chosen": -1.8460698127746582, "logits/rejected": -1.846427321434021, "logps/chosen": -59.15906524658203, "logps/rejected": -154.60516357421875, "loss": 0.4711, "rewards/accuracies": 0.0, "rewards/chosen": 7.737923622131348, "rewards/margins": -0.16224908828735352, "rewards/rejected": 7.900172710418701, "step": 8237 }, { "epoch": 1.82, "learning_rate": 2.0317830395325255e-07, "logits/chosen": -1.937140703201294, "logits/rejected": -1.8754504919052124, "logps/chosen": -39.0480842590332, "logps/rejected": -8.175712585449219, "loss": 1.0743, "rewards/accuracies": 1.0, "rewards/chosen": 3.229526996612549, "rewards/margins": 1.8703840970993042, "rewards/rejected": 1.3591428995132446, "step": 8238 }, { "epoch": 1.82, "learning_rate": 2.026728704459202e-07, "logits/chosen": -1.9285765886306763, "logits/rejected": -1.8792636394500732, "logps/chosen": -36.82386779785156, "logps/rejected": -67.98164367675781, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": 3.242211103439331, "rewards/margins": 1.4392386674880981, "rewards/rejected": 1.802972435951233, "step": 8239 }, { "epoch": 1.82, "learning_rate": 2.0216805338359236e-07, "logits/chosen": -2.1116816997528076, "logits/rejected": -2.0883777141571045, "logps/chosen": -54.08271408081055, "logps/rejected": -25.869604110717773, "loss": 0.5871, "rewards/accuracies": 1.0, "rewards/chosen": 3.4055163860321045, "rewards/margins": 0.9340894222259521, "rewards/rejected": 2.4714269638061523, "step": 8240 }, { "epoch": 1.82, "learning_rate": 2.0166385283113487e-07, "logits/chosen": -1.82565176486969, "logits/rejected": -1.8247411251068115, "logps/chosen": -38.43779754638672, "logps/rejected": -64.45347595214844, "loss": 0.3762, "rewards/accuracies": 1.0, "rewards/chosen": 3.572063446044922, "rewards/margins": 0.08238053321838379, "rewards/rejected": 3.489682912826538, "step": 8241 }, { "epoch": 1.82, "learning_rate": 2.011602688533387e-07, "logits/chosen": -1.8853174448013306, "logits/rejected": -1.9171370267868042, "logps/chosen": -32.47159194946289, "logps/rejected": -55.56949996948242, "loss": 1.0783, "rewards/accuracies": 0.0, "rewards/chosen": 2.708662509918213, "rewards/margins": -2.0032949447631836, "rewards/rejected": 4.7119574546813965, "step": 8242 }, { "epoch": 1.82, "learning_rate": 2.006573015149116e-07, "logits/chosen": -1.9932020902633667, "logits/rejected": -2.0183656215667725, "logps/chosen": -45.007110595703125, "logps/rejected": -55.90423583984375, "loss": 0.0926, "rewards/accuracies": 1.0, "rewards/chosen": 4.316506385803223, "rewards/margins": 1.676217794418335, "rewards/rejected": 2.6402885913848877, "step": 8243 }, { "epoch": 1.82, "learning_rate": 2.0015495088048343e-07, "logits/chosen": -1.816009759902954, "logits/rejected": -1.819129467010498, "logps/chosen": -53.04656982421875, "logps/rejected": -52.594200134277344, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 3.2191810607910156, "rewards/margins": 1.5782928466796875, "rewards/rejected": 1.6408882141113281, "step": 8244 }, { "epoch": 1.82, "learning_rate": 1.9965321701460538e-07, "logits/chosen": -1.8423144817352295, "logits/rejected": -1.791700005531311, "logps/chosen": -51.949275970458984, "logps/rejected": -38.3668098449707, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": 3.8289875984191895, "rewards/margins": 1.4862487316131592, "rewards/rejected": 2.3427388668060303, "step": 8245 }, { "epoch": 1.83, "learning_rate": 1.9915209998174978e-07, "logits/chosen": -1.8158507347106934, "logits/rejected": -1.8447941541671753, "logps/chosen": -23.476802825927734, "logps/rejected": -116.62344360351562, "loss": 0.5821, "rewards/accuracies": 0.0, "rewards/chosen": 4.629356861114502, "rewards/margins": -0.6147012710571289, "rewards/rejected": 5.244058132171631, "step": 8246 }, { "epoch": 1.83, "learning_rate": 1.9865159984630844e-07, "logits/chosen": -1.819836974143982, "logits/rejected": -1.819836974143982, "logps/chosen": -61.57915496826172, "logps/rejected": -61.57915496826172, "loss": 0.427, "rewards/accuracies": 0.0, "rewards/chosen": 5.023353576660156, "rewards/margins": 0.0, "rewards/rejected": 5.023353576660156, "step": 8247 }, { "epoch": 1.83, "learning_rate": 1.981517166725938e-07, "logits/chosen": -1.9599878787994385, "logits/rejected": -1.812103271484375, "logps/chosen": -77.74649810791016, "logps/rejected": -11.911365509033203, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 4.461612701416016, "rewards/margins": 3.049689769744873, "rewards/rejected": 1.4119230508804321, "step": 8248 }, { "epoch": 1.83, "learning_rate": 1.9765245052484117e-07, "logits/chosen": -1.7214748859405518, "logits/rejected": -1.6280328035354614, "logps/chosen": -89.11528015136719, "logps/rejected": -50.757564544677734, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 5.705211162567139, "rewards/margins": 4.0044050216674805, "rewards/rejected": 1.7008060216903687, "step": 8249 }, { "epoch": 1.83, "learning_rate": 1.9715380146720308e-07, "logits/chosen": -1.8011209964752197, "logits/rejected": -1.934519648551941, "logps/chosen": -59.476383209228516, "logps/rejected": -245.74635314941406, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 10.290143013000488, "rewards/margins": -1.088200569152832, "rewards/rejected": 11.37834358215332, "step": 8250 }, { "epoch": 1.83, "learning_rate": 1.9665576956375664e-07, "logits/chosen": -1.7487497329711914, "logits/rejected": -1.6353918313980103, "logps/chosen": -69.99937438964844, "logps/rejected": -36.15120315551758, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 4.02212381362915, "rewards/margins": 2.151015520095825, "rewards/rejected": 1.8711082935333252, "step": 8251 }, { "epoch": 1.83, "learning_rate": 1.9615835487849677e-07, "logits/chosen": -1.692874550819397, "logits/rejected": -1.7416727542877197, "logps/chosen": -43.59321975708008, "logps/rejected": -81.31294250488281, "loss": 1.7151, "rewards/accuracies": 0.0, "rewards/chosen": 4.98270845413208, "rewards/margins": -3.3617730140686035, "rewards/rejected": 8.344481468200684, "step": 8252 }, { "epoch": 1.83, "learning_rate": 1.956615574753401e-07, "logits/chosen": -1.9919294118881226, "logits/rejected": -2.0234782695770264, "logps/chosen": -41.779541015625, "logps/rejected": -86.82233428955078, "loss": 0.2806, "rewards/accuracies": 1.0, "rewards/chosen": 3.521536350250244, "rewards/margins": 0.2942314147949219, "rewards/rejected": 3.2273049354553223, "step": 8253 }, { "epoch": 1.83, "learning_rate": 1.9516537741812337e-07, "logits/chosen": -1.9683729410171509, "logits/rejected": -1.875213861465454, "logps/chosen": -33.70851516723633, "logps/rejected": -5.525436878204346, "loss": 0.192, "rewards/accuracies": 1.0, "rewards/chosen": 2.7412688732147217, "rewards/margins": 1.9161040782928467, "rewards/rejected": 0.825164794921875, "step": 8254 }, { "epoch": 1.83, "learning_rate": 1.9466981477060553e-07, "logits/chosen": -1.8004292249679565, "logits/rejected": -1.7980372905731201, "logps/chosen": -44.03794479370117, "logps/rejected": -52.56000518798828, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": 3.4654293060302734, "rewards/margins": 0.7953174114227295, "rewards/rejected": 2.670111894607544, "step": 8255 }, { "epoch": 1.83, "learning_rate": 1.9417486959646515e-07, "logits/chosen": -2.0332083702087402, "logits/rejected": -1.7162995338439941, "logps/chosen": -160.6214599609375, "logps/rejected": -40.1451301574707, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": 6.772674560546875, "rewards/margins": 4.080990791320801, "rewards/rejected": 2.691683530807495, "step": 8256 }, { "epoch": 1.83, "learning_rate": 1.9368054195930076e-07, "logits/chosen": -1.8120023012161255, "logits/rejected": -1.7388988733291626, "logps/chosen": -42.05975341796875, "logps/rejected": -10.031094551086426, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": 2.6874330043792725, "rewards/margins": 2.2648346424102783, "rewards/rejected": 0.42259836196899414, "step": 8257 }, { "epoch": 1.83, "learning_rate": 1.9318683192263322e-07, "logits/chosen": -1.7117600440979004, "logits/rejected": -1.742027759552002, "logps/chosen": -49.676422119140625, "logps/rejected": -152.60302734375, "loss": 2.2187, "rewards/accuracies": 0.0, "rewards/chosen": 5.371824741363525, "rewards/margins": -4.396984577178955, "rewards/rejected": 9.76880931854248, "step": 8258 }, { "epoch": 1.83, "learning_rate": 1.9269373954990178e-07, "logits/chosen": -1.8986263275146484, "logits/rejected": -1.8986263275146484, "logps/chosen": -17.9658203125, "logps/rejected": -17.9658203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.9017266035079956, "rewards/margins": 0.0, "rewards/rejected": 1.9017266035079956, "step": 8259 }, { "epoch": 1.83, "learning_rate": 1.922012649044691e-07, "logits/chosen": -1.9813110828399658, "logits/rejected": -1.9924062490463257, "logps/chosen": -53.828330993652344, "logps/rejected": -75.49008178710938, "loss": 1.3216, "rewards/accuracies": 0.0, "rewards/chosen": 7.886682987213135, "rewards/margins": -2.5453543663024902, "rewards/rejected": 10.432037353515625, "step": 8260 }, { "epoch": 1.83, "learning_rate": 1.9170940804961512e-07, "logits/chosen": -2.164344072341919, "logits/rejected": -2.0651369094848633, "logps/chosen": -88.3479232788086, "logps/rejected": -21.44546890258789, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 5.585999488830566, "rewards/margins": 4.688529014587402, "rewards/rejected": 0.8974702954292297, "step": 8261 }, { "epoch": 1.83, "learning_rate": 1.912181690485443e-07, "logits/chosen": -2.0495495796203613, "logits/rejected": -2.0049142837524414, "logps/chosen": -39.47190856933594, "logps/rejected": -40.999549865722656, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 3.2201759815216064, "rewards/margins": 0.1702895164489746, "rewards/rejected": 3.049886465072632, "step": 8262 }, { "epoch": 1.83, "learning_rate": 1.9072754796437886e-07, "logits/chosen": -1.9608161449432373, "logits/rejected": -1.9098432064056396, "logps/chosen": -99.4564208984375, "logps/rejected": -69.44294738769531, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 7.315097332000732, "rewards/margins": 2.0352067947387695, "rewards/rejected": 5.279890537261963, "step": 8263 }, { "epoch": 1.83, "learning_rate": 1.9023754486016177e-07, "logits/chosen": -1.7583404779434204, "logits/rejected": -1.7915093898773193, "logps/chosen": -34.59459686279297, "logps/rejected": -69.8843994140625, "loss": 1.7033, "rewards/accuracies": 0.0, "rewards/chosen": 3.6967475414276123, "rewards/margins": -2.3823726177215576, "rewards/rejected": 6.07912015914917, "step": 8264 }, { "epoch": 1.83, "learning_rate": 1.8974815979885875e-07, "logits/chosen": -1.4206491708755493, "logits/rejected": -1.3962030410766602, "logps/chosen": -32.3951416015625, "logps/rejected": -57.23994445800781, "loss": 0.3039, "rewards/accuracies": 1.0, "rewards/chosen": 3.8305015563964844, "rewards/margins": 0.31180644035339355, "rewards/rejected": 3.518695116043091, "step": 8265 }, { "epoch": 1.83, "learning_rate": 1.8925939284335225e-07, "logits/chosen": -1.800702691078186, "logits/rejected": -1.8081144094467163, "logps/chosen": -38.998958587646484, "logps/rejected": -31.684326171875, "loss": 0.3211, "rewards/accuracies": 1.0, "rewards/chosen": 3.6370441913604736, "rewards/margins": 0.20974302291870117, "rewards/rejected": 3.4273011684417725, "step": 8266 }, { "epoch": 1.83, "learning_rate": 1.8877124405645043e-07, "logits/chosen": -1.8671684265136719, "logits/rejected": -1.832515001296997, "logps/chosen": -62.93071365356445, "logps/rejected": -190.76759338378906, "loss": 0.4008, "rewards/accuracies": 1.0, "rewards/chosen": 8.205519676208496, "rewards/margins": 2.989358901977539, "rewards/rejected": 5.216160774230957, "step": 8267 }, { "epoch": 1.83, "learning_rate": 1.8828371350087637e-07, "logits/chosen": -1.917516827583313, "logits/rejected": -1.9185625314712524, "logps/chosen": -79.35350036621094, "logps/rejected": -68.08255767822266, "loss": 0.1716, "rewards/accuracies": 1.0, "rewards/chosen": 7.025834560394287, "rewards/margins": 0.9216666221618652, "rewards/rejected": 6.104167938232422, "step": 8268 }, { "epoch": 1.83, "learning_rate": 1.8779680123927947e-07, "logits/chosen": -1.823610544204712, "logits/rejected": -1.823610544204712, "logps/chosen": -37.009613037109375, "logps/rejected": -37.009613037109375, "loss": 0.3472, "rewards/accuracies": 0.0, "rewards/chosen": 3.473468065261841, "rewards/margins": 0.0, "rewards/rejected": 3.473468065261841, "step": 8269 }, { "epoch": 1.83, "learning_rate": 1.8731050733422463e-07, "logits/chosen": -1.7773125171661377, "logits/rejected": -1.7564353942871094, "logps/chosen": -124.70950317382812, "logps/rejected": -98.6930160522461, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 10.2367582321167, "rewards/margins": 3.007835865020752, "rewards/rejected": 7.228922367095947, "step": 8270 }, { "epoch": 1.83, "learning_rate": 1.8682483184820077e-07, "logits/chosen": -1.9954179525375366, "logits/rejected": -2.0131633281707764, "logps/chosen": -49.669525146484375, "logps/rejected": -92.89405822753906, "loss": 0.3653, "rewards/accuracies": 1.0, "rewards/chosen": 3.383990526199341, "rewards/margins": 0.049251556396484375, "rewards/rejected": 3.3347389698028564, "step": 8271 }, { "epoch": 1.83, "learning_rate": 1.863397748436152e-07, "logits/chosen": -1.8482900857925415, "logits/rejected": -1.7109311819076538, "logps/chosen": -105.79341125488281, "logps/rejected": -49.659751892089844, "loss": 0.368, "rewards/accuracies": 1.0, "rewards/chosen": 5.822509765625, "rewards/margins": 3.1295676231384277, "rewards/rejected": 2.6929421424865723, "step": 8272 }, { "epoch": 1.83, "learning_rate": 1.8585533638279695e-07, "logits/chosen": -1.9269757270812988, "logits/rejected": -1.8438583612442017, "logps/chosen": -70.00598907470703, "logps/rejected": -101.53849792480469, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 5.385589599609375, "rewards/margins": 2.9072189331054688, "rewards/rejected": 2.4783706665039062, "step": 8273 }, { "epoch": 1.83, "learning_rate": 1.8537151652799512e-07, "logits/chosen": -1.75107741355896, "logits/rejected": -1.7091939449310303, "logps/chosen": -27.90961456298828, "logps/rejected": -43.11180877685547, "loss": 0.1886, "rewards/accuracies": 1.0, "rewards/chosen": 2.477522373199463, "rewards/margins": 0.9665192365646362, "rewards/rejected": 1.5110031366348267, "step": 8274 }, { "epoch": 1.83, "learning_rate": 1.8488831534137885e-07, "logits/chosen": -1.70707368850708, "logits/rejected": -1.7065125703811646, "logps/chosen": -75.79237365722656, "logps/rejected": -84.77571105957031, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 4.255481719970703, "rewards/margins": 1.5617187023162842, "rewards/rejected": 2.693763017654419, "step": 8275 }, { "epoch": 1.83, "learning_rate": 1.844057328850396e-07, "logits/chosen": -1.8630937337875366, "logits/rejected": -1.8651931285858154, "logps/chosen": -59.64069366455078, "logps/rejected": -87.46188354492188, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 7.592705726623535, "rewards/margins": 3.625236749649048, "rewards/rejected": 3.9674689769744873, "step": 8276 }, { "epoch": 1.83, "learning_rate": 1.839237692209861e-07, "logits/chosen": -2.0004115104675293, "logits/rejected": -2.0151760578155518, "logps/chosen": -71.70893859863281, "logps/rejected": -78.29539489746094, "loss": 0.2757, "rewards/accuracies": 1.0, "rewards/chosen": 7.999803066253662, "rewards/margins": 1.5582027435302734, "rewards/rejected": 6.441600322723389, "step": 8277 }, { "epoch": 1.83, "learning_rate": 1.834424244111521e-07, "logits/chosen": -1.9449039697647095, "logits/rejected": -1.9871629476547241, "logps/chosen": -44.303680419921875, "logps/rejected": -64.82447814941406, "loss": 2.5442, "rewards/accuracies": 0.0, "rewards/chosen": 3.3242080211639404, "rewards/margins": -3.556466817855835, "rewards/rejected": 6.880674839019775, "step": 8278 }, { "epoch": 1.83, "learning_rate": 1.8296169851738655e-07, "logits/chosen": -1.946885108947754, "logits/rejected": -1.9110338687896729, "logps/chosen": -39.93058776855469, "logps/rejected": -65.220947265625, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 4.2932329177856445, "rewards/margins": 2.619710683822632, "rewards/rejected": 1.6735222339630127, "step": 8279 }, { "epoch": 1.83, "learning_rate": 1.824815916014644e-07, "logits/chosen": -1.9532710313796997, "logits/rejected": -1.9742850065231323, "logps/chosen": -40.011817932128906, "logps/rejected": -69.17597198486328, "loss": 1.596, "rewards/accuracies": 0.0, "rewards/chosen": 4.3653411865234375, "rewards/margins": -2.6175284385681152, "rewards/rejected": 6.982869625091553, "step": 8280 }, { "epoch": 1.83, "learning_rate": 1.8200210372507576e-07, "logits/chosen": -1.9828211069107056, "logits/rejected": -1.9422568082809448, "logps/chosen": -40.91583251953125, "logps/rejected": -49.65007781982422, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 4.177104473114014, "rewards/margins": 1.5420863628387451, "rewards/rejected": 2.6350181102752686, "step": 8281 }, { "epoch": 1.83, "learning_rate": 1.815232349498347e-07, "logits/chosen": -2.145193099975586, "logits/rejected": -2.1525514125823975, "logps/chosen": -24.054033279418945, "logps/rejected": -43.93836975097656, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/chosen": 4.568879127502441, "rewards/margins": 0.7460184097290039, "rewards/rejected": 3.8228607177734375, "step": 8282 }, { "epoch": 1.83, "learning_rate": 1.8104498533727588e-07, "logits/chosen": -2.033139228820801, "logits/rejected": -2.0292088985443115, "logps/chosen": -55.25071716308594, "logps/rejected": -86.86479187011719, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 8.533889770507812, "rewards/margins": 4.101978778839111, "rewards/rejected": 4.431910991668701, "step": 8283 }, { "epoch": 1.83, "learning_rate": 1.8056735494885014e-07, "logits/chosen": -1.5880972146987915, "logits/rejected": -1.5647947788238525, "logps/chosen": -3.391897678375244, "logps/rejected": -1.3163163661956787, "loss": 0.4498, "rewards/accuracies": 1.0, "rewards/chosen": 0.9613147974014282, "rewards/margins": 0.13403749465942383, "rewards/rejected": 0.8272773027420044, "step": 8284 }, { "epoch": 1.83, "learning_rate": 1.8009034384593505e-07, "logits/chosen": -1.9397045373916626, "logits/rejected": -1.8669657707214355, "logps/chosen": -60.694419860839844, "logps/rejected": -49.037723541259766, "loss": 0.1782, "rewards/accuracies": 1.0, "rewards/chosen": 4.741949558258057, "rewards/margins": 2.132852554321289, "rewards/rejected": 2.6090970039367676, "step": 8285 }, { "epoch": 1.83, "learning_rate": 1.7961395208982268e-07, "logits/chosen": -2.018929958343506, "logits/rejected": -1.9453693628311157, "logps/chosen": -102.1503677368164, "logps/rejected": -27.277433395385742, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 5.272368907928467, "rewards/margins": 4.2004075050354, "rewards/rejected": 1.0719614028930664, "step": 8286 }, { "epoch": 1.83, "learning_rate": 1.7913817974173075e-07, "logits/chosen": -2.05098032951355, "logits/rejected": -2.0271196365356445, "logps/chosen": -48.01565933227539, "logps/rejected": -71.71145629882812, "loss": 0.6689, "rewards/accuracies": 0.0, "rewards/chosen": 4.540392875671387, "rewards/margins": -1.001162052154541, "rewards/rejected": 5.541554927825928, "step": 8287 }, { "epoch": 1.83, "learning_rate": 1.786630268627937e-07, "logits/chosen": -1.7294743061065674, "logits/rejected": -1.7677816152572632, "logps/chosen": -25.13560676574707, "logps/rejected": -70.88316345214844, "loss": 0.9098, "rewards/accuracies": 1.0, "rewards/chosen": 3.3146889209747314, "rewards/margins": 0.5609931945800781, "rewards/rejected": 2.7536957263946533, "step": 8288 }, { "epoch": 1.83, "learning_rate": 1.781884935140671e-07, "logits/chosen": -2.0714101791381836, "logits/rejected": -2.017225980758667, "logps/chosen": -32.44582748413086, "logps/rejected": -22.475852966308594, "loss": 0.1548, "rewards/accuracies": 1.0, "rewards/chosen": 3.3150486946105957, "rewards/margins": 2.054865837097168, "rewards/rejected": 1.2601829767227173, "step": 8289 }, { "epoch": 1.83, "learning_rate": 1.7771457975652772e-07, "logits/chosen": -1.9633337259292603, "logits/rejected": -1.954669713973999, "logps/chosen": -50.27759552001953, "logps/rejected": -71.05450439453125, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": 5.544312953948975, "rewards/margins": 1.0604596138000488, "rewards/rejected": 4.483853340148926, "step": 8290 }, { "epoch": 1.84, "learning_rate": 1.7724128565107245e-07, "logits/chosen": -2.036409854888916, "logits/rejected": -1.941881775856018, "logps/chosen": -85.68213653564453, "logps/rejected": -37.40251159667969, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 4.442458629608154, "rewards/margins": 3.780350685119629, "rewards/rejected": 0.6621078848838806, "step": 8291 }, { "epoch": 1.84, "learning_rate": 1.767686112585182e-07, "logits/chosen": -2.04014253616333, "logits/rejected": -2.043339252471924, "logps/chosen": -51.295745849609375, "logps/rejected": -62.58026123046875, "loss": 0.8568, "rewards/accuracies": 1.0, "rewards/chosen": 3.9310097694396973, "rewards/margins": 1.6259307861328125, "rewards/rejected": 2.3050789833068848, "step": 8292 }, { "epoch": 1.84, "learning_rate": 1.7629655663960298e-07, "logits/chosen": -2.2634975910186768, "logits/rejected": -2.249072790145874, "logps/chosen": -66.49363708496094, "logps/rejected": -50.584922790527344, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 8.623130798339844, "rewards/margins": 4.957223892211914, "rewards/rejected": 3.6659066677093506, "step": 8293 }, { "epoch": 1.84, "learning_rate": 1.7582512185498446e-07, "logits/chosen": -2.1058077812194824, "logits/rejected": -2.107048273086548, "logps/chosen": -65.76478576660156, "logps/rejected": -101.49170684814453, "loss": 0.3013, "rewards/accuracies": 1.0, "rewards/chosen": 9.132735252380371, "rewards/margins": 0.19140243530273438, "rewards/rejected": 8.941332817077637, "step": 8294 }, { "epoch": 1.84, "learning_rate": 1.7535430696524025e-07, "logits/chosen": -1.950955867767334, "logits/rejected": -1.96653413772583, "logps/chosen": -86.82750701904297, "logps/rejected": -73.12635803222656, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 9.818037033081055, "rewards/margins": 1.5742826461791992, "rewards/rejected": 8.243754386901855, "step": 8295 }, { "epoch": 1.84, "learning_rate": 1.748841120308703e-07, "logits/chosen": -2.0482447147369385, "logits/rejected": -1.9575841426849365, "logps/chosen": -141.75888061523438, "logps/rejected": -42.5998420715332, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 8.534686088562012, "rewards/margins": 7.266757488250732, "rewards/rejected": 1.2679287195205688, "step": 8296 }, { "epoch": 1.84, "learning_rate": 1.7441453711229238e-07, "logits/chosen": -1.6554144620895386, "logits/rejected": -1.728065013885498, "logps/chosen": -30.10958480834961, "logps/rejected": -23.563261032104492, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 3.682121753692627, "rewards/margins": 1.163550853729248, "rewards/rejected": 2.518570899963379, "step": 8297 }, { "epoch": 1.84, "learning_rate": 1.739455822698455e-07, "logits/chosen": -2.1154141426086426, "logits/rejected": -2.044422149658203, "logps/chosen": -89.09768676757812, "logps/rejected": -51.33758544921875, "loss": 0.1094, "rewards/accuracies": 1.0, "rewards/chosen": 5.960517883300781, "rewards/margins": 2.0581107139587402, "rewards/rejected": 3.902407169342041, "step": 8298 }, { "epoch": 1.84, "learning_rate": 1.7347724756379025e-07, "logits/chosen": -1.9033018350601196, "logits/rejected": -1.8970690965652466, "logps/chosen": -28.095184326171875, "logps/rejected": -32.844852447509766, "loss": 0.2414, "rewards/accuracies": 1.0, "rewards/chosen": 3.548668622970581, "rewards/margins": 0.48743700981140137, "rewards/rejected": 3.0612316131591797, "step": 8299 }, { "epoch": 1.84, "learning_rate": 1.7300953305430578e-07, "logits/chosen": -1.8686630725860596, "logits/rejected": -1.864874005317688, "logps/chosen": -39.96354675292969, "logps/rejected": -44.676910400390625, "loss": 0.4043, "rewards/accuracies": 0.0, "rewards/chosen": 2.8957290649414062, "rewards/margins": -0.20900893211364746, "rewards/rejected": 3.1047379970550537, "step": 8300 }, { "epoch": 1.84, "learning_rate": 1.725424388014929e-07, "logits/chosen": -2.099557399749756, "logits/rejected": -2.028812885284424, "logps/chosen": -113.3817138671875, "logps/rejected": -35.45550537109375, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 5.986599922180176, "rewards/margins": 2.6786141395568848, "rewards/rejected": 3.307985782623291, "step": 8301 }, { "epoch": 1.84, "learning_rate": 1.7207596486537138e-07, "logits/chosen": -2.0686190128326416, "logits/rejected": -2.066457748413086, "logps/chosen": -45.717063903808594, "logps/rejected": -63.463748931884766, "loss": 0.2382, "rewards/accuracies": 1.0, "rewards/chosen": 5.499190807342529, "rewards/margins": 0.5882277488708496, "rewards/rejected": 4.91096305847168, "step": 8302 }, { "epoch": 1.84, "learning_rate": 1.7161011130588323e-07, "logits/chosen": -2.0018866062164307, "logits/rejected": -2.038428783416748, "logps/chosen": -35.65523147583008, "logps/rejected": -62.77471160888672, "loss": 0.348, "rewards/accuracies": 1.0, "rewards/chosen": 3.0877163410186768, "rewards/margins": 0.14276385307312012, "rewards/rejected": 2.9449524879455566, "step": 8303 }, { "epoch": 1.84, "learning_rate": 1.7114487818288727e-07, "logits/chosen": -1.5759031772613525, "logits/rejected": -1.6401339769363403, "logps/chosen": -19.92001724243164, "logps/rejected": -48.985496520996094, "loss": 0.9944, "rewards/accuracies": 0.0, "rewards/chosen": 1.5686718225479126, "rewards/margins": -1.8086766004562378, "rewards/rejected": 3.3773484230041504, "step": 8304 }, { "epoch": 1.84, "learning_rate": 1.7068026555616734e-07, "logits/chosen": -1.7086821794509888, "logits/rejected": -1.7973343133926392, "logps/chosen": -34.936405181884766, "logps/rejected": -49.81911087036133, "loss": 1.1458, "rewards/accuracies": 0.0, "rewards/chosen": 3.293854236602783, "rewards/margins": -2.1817879676818848, "rewards/rejected": 5.475642204284668, "step": 8305 }, { "epoch": 1.84, "learning_rate": 1.7021627348542347e-07, "logits/chosen": -1.7400753498077393, "logits/rejected": -1.7539408206939697, "logps/chosen": -30.457509994506836, "logps/rejected": -35.23515701293945, "loss": 0.4735, "rewards/accuracies": 1.0, "rewards/chosen": 4.762244701385498, "rewards/margins": 0.5299010276794434, "rewards/rejected": 4.232343673706055, "step": 8306 }, { "epoch": 1.84, "learning_rate": 1.6975290203027795e-07, "logits/chosen": -1.6045783758163452, "logits/rejected": -1.565615177154541, "logps/chosen": -35.8336181640625, "logps/rejected": -42.60991668701172, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 3.1350371837615967, "rewards/margins": 0.6192693710327148, "rewards/rejected": 2.515767812728882, "step": 8307 }, { "epoch": 1.84, "learning_rate": 1.6929015125027314e-07, "logits/chosen": -1.8694710731506348, "logits/rejected": -1.6495509147644043, "logps/chosen": -127.30101013183594, "logps/rejected": -35.204566955566406, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 7.189326763153076, "rewards/margins": 6.700040340423584, "rewards/rejected": 0.4892864227294922, "step": 8308 }, { "epoch": 1.84, "learning_rate": 1.6882802120487096e-07, "logits/chosen": -1.770827054977417, "logits/rejected": -1.715248465538025, "logps/chosen": -30.642837524414062, "logps/rejected": -68.6693115234375, "loss": 0.4913, "rewards/accuracies": 0.0, "rewards/chosen": 3.687260389328003, "rewards/margins": -0.07713699340820312, "rewards/rejected": 3.764397382736206, "step": 8309 }, { "epoch": 1.84, "learning_rate": 1.6836651195345445e-07, "logits/chosen": -2.1927249431610107, "logits/rejected": -2.1681206226348877, "logps/chosen": -26.12529182434082, "logps/rejected": -56.96388244628906, "loss": 0.1768, "rewards/accuracies": 1.0, "rewards/chosen": 3.280134677886963, "rewards/margins": 0.8699951171875, "rewards/rejected": 2.410139560699463, "step": 8310 }, { "epoch": 1.84, "learning_rate": 1.6790562355532613e-07, "logits/chosen": -1.5806818008422852, "logits/rejected": -1.5483825206756592, "logps/chosen": -22.451602935791016, "logps/rejected": -30.953575134277344, "loss": 0.7473, "rewards/accuracies": 0.0, "rewards/chosen": 2.3926472663879395, "rewards/margins": -1.2365634441375732, "rewards/rejected": 3.6292107105255127, "step": 8311 }, { "epoch": 1.84, "learning_rate": 1.6744535606970925e-07, "logits/chosen": -1.739018201828003, "logits/rejected": -1.739018201828003, "logps/chosen": -34.82914352416992, "logps/rejected": -34.82914352416992, "loss": 0.7313, "rewards/accuracies": 0.0, "rewards/chosen": 3.8902204036712646, "rewards/margins": 0.0, "rewards/rejected": 3.8902204036712646, "step": 8312 }, { "epoch": 1.84, "learning_rate": 1.6698570955574646e-07, "logits/chosen": -1.8048369884490967, "logits/rejected": -1.808915376663208, "logps/chosen": -54.46308898925781, "logps/rejected": -65.287353515625, "loss": 0.1873, "rewards/accuracies": 1.0, "rewards/chosen": 4.8389434814453125, "rewards/margins": 0.9507536888122559, "rewards/rejected": 3.8881897926330566, "step": 8313 }, { "epoch": 1.84, "learning_rate": 1.6652668407250272e-07, "logits/chosen": -1.8319071531295776, "logits/rejected": -1.7705936431884766, "logps/chosen": -94.30140686035156, "logps/rejected": -63.42837905883789, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 5.335978984832764, "rewards/margins": 2.0319631099700928, "rewards/rejected": 3.304015874862671, "step": 8314 }, { "epoch": 1.84, "learning_rate": 1.6606827967896034e-07, "logits/chosen": -1.7528353929519653, "logits/rejected": -1.7252354621887207, "logps/chosen": -41.02954864501953, "logps/rejected": -61.264766693115234, "loss": 0.2326, "rewards/accuracies": 1.0, "rewards/chosen": 4.345179080963135, "rewards/margins": 0.5377826690673828, "rewards/rejected": 3.807396411895752, "step": 8315 }, { "epoch": 1.84, "learning_rate": 1.6561049643402327e-07, "logits/chosen": -1.9173574447631836, "logits/rejected": -1.8830668926239014, "logps/chosen": -44.94816970825195, "logps/rejected": -56.63048553466797, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 2.3464183807373047, "rewards/margins": 1.3222408294677734, "rewards/rejected": 1.0241775512695312, "step": 8316 }, { "epoch": 1.84, "learning_rate": 1.651533343965156e-07, "logits/chosen": -1.99739408493042, "logits/rejected": -1.8817143440246582, "logps/chosen": -118.69541931152344, "logps/rejected": -35.14375686645508, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 6.783970832824707, "rewards/margins": 3.9065136909484863, "rewards/rejected": 2.8774571418762207, "step": 8317 }, { "epoch": 1.84, "learning_rate": 1.6469679362518255e-07, "logits/chosen": -1.6076879501342773, "logits/rejected": -1.5907124280929565, "logps/chosen": -30.40682601928711, "logps/rejected": -53.15776062011719, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 2.489471197128296, "rewards/margins": 1.7317109107971191, "rewards/rejected": 0.757760226726532, "step": 8318 }, { "epoch": 1.84, "learning_rate": 1.6424087417868717e-07, "logits/chosen": -2.1307356357574463, "logits/rejected": -2.129629135131836, "logps/chosen": -18.19037628173828, "logps/rejected": -36.754356384277344, "loss": 0.581, "rewards/accuracies": 0.0, "rewards/chosen": 3.4950897693634033, "rewards/margins": -0.3038649559020996, "rewards/rejected": 3.798954725265503, "step": 8319 }, { "epoch": 1.84, "learning_rate": 1.6378557611561484e-07, "logits/chosen": -1.8208551406860352, "logits/rejected": -1.8914259672164917, "logps/chosen": -47.03144073486328, "logps/rejected": -126.41911315917969, "loss": 1.1748, "rewards/accuracies": 0.0, "rewards/chosen": 7.336152076721191, "rewards/margins": -2.244800567626953, "rewards/rejected": 9.580952644348145, "step": 8320 }, { "epoch": 1.84, "learning_rate": 1.633308994944699e-07, "logits/chosen": -1.728309154510498, "logits/rejected": -1.682726263999939, "logps/chosen": -45.45301818847656, "logps/rejected": -40.18441390991211, "loss": 0.1546, "rewards/accuracies": 1.0, "rewards/chosen": 4.566264629364014, "rewards/margins": 1.1941022872924805, "rewards/rejected": 3.372162342071533, "step": 8321 }, { "epoch": 1.84, "learning_rate": 1.6287684437367724e-07, "logits/chosen": -1.8363994359970093, "logits/rejected": -1.8204222917556763, "logps/chosen": -34.080101013183594, "logps/rejected": -57.19111633300781, "loss": 0.4046, "rewards/accuracies": 0.0, "rewards/chosen": 3.943631887435913, "rewards/margins": -0.19667744636535645, "rewards/rejected": 4.1403093338012695, "step": 8322 }, { "epoch": 1.84, "learning_rate": 1.6242341081158298e-07, "logits/chosen": -1.780091643333435, "logits/rejected": -1.8655779361724854, "logps/chosen": -24.64600372314453, "logps/rejected": -92.12815856933594, "loss": 0.9741, "rewards/accuracies": 0.0, "rewards/chosen": 2.5704009532928467, "rewards/margins": -1.6525070667266846, "rewards/rejected": 4.222908020019531, "step": 8323 }, { "epoch": 1.84, "learning_rate": 1.619705988664505e-07, "logits/chosen": -2.1487057209014893, "logits/rejected": -2.1589434146881104, "logps/chosen": -38.48963928222656, "logps/rejected": -62.705589294433594, "loss": 0.3027, "rewards/accuracies": 1.0, "rewards/chosen": 4.285444736480713, "rewards/margins": 0.9509742259979248, "rewards/rejected": 3.334470510482788, "step": 8324 }, { "epoch": 1.84, "learning_rate": 1.6151840859646552e-07, "logits/chosen": -1.746385931968689, "logits/rejected": -1.746385931968689, "logps/chosen": -61.360225677490234, "logps/rejected": -61.360225677490234, "loss": 0.5644, "rewards/accuracies": 0.0, "rewards/chosen": 2.1337344646453857, "rewards/margins": 0.0, "rewards/rejected": 2.1337344646453857, "step": 8325 }, { "epoch": 1.84, "learning_rate": 1.6106684005973371e-07, "logits/chosen": -1.8699300289154053, "logits/rejected": -1.791634202003479, "logps/chosen": -149.6273193359375, "logps/rejected": -26.844223022460938, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 4.748959541320801, "rewards/margins": 3.8236963748931885, "rewards/rejected": 0.9252632260322571, "step": 8326 }, { "epoch": 1.84, "learning_rate": 1.6061589331428096e-07, "logits/chosen": -1.816823959350586, "logits/rejected": -1.816823959350586, "logps/chosen": -15.927600860595703, "logps/rejected": -15.927600860595703, "loss": 0.3483, "rewards/accuracies": 0.0, "rewards/chosen": 4.055898666381836, "rewards/margins": 0.0, "rewards/rejected": 4.055898666381836, "step": 8327 }, { "epoch": 1.84, "learning_rate": 1.6016556841805142e-07, "logits/chosen": -1.9791948795318604, "logits/rejected": -1.8597676753997803, "logps/chosen": -116.81770324707031, "logps/rejected": -68.0801773071289, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 5.304805278778076, "rewards/margins": 1.3310267925262451, "rewards/rejected": 3.973778486251831, "step": 8328 }, { "epoch": 1.84, "learning_rate": 1.5971586542891272e-07, "logits/chosen": -1.8857086896896362, "logits/rejected": -1.8024171590805054, "logps/chosen": -49.04079818725586, "logps/rejected": -9.237995147705078, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 4.104712963104248, "rewards/margins": 3.4238250255584717, "rewards/rejected": 0.6808879971504211, "step": 8329 }, { "epoch": 1.84, "learning_rate": 1.5926678440464916e-07, "logits/chosen": -2.1482696533203125, "logits/rejected": -2.161221981048584, "logps/chosen": -50.41948318481445, "logps/rejected": -98.3807373046875, "loss": 0.714, "rewards/accuracies": 0.0, "rewards/chosen": 5.413593769073486, "rewards/margins": -1.1514058113098145, "rewards/rejected": 6.564999580383301, "step": 8330 }, { "epoch": 1.84, "learning_rate": 1.5881832540296682e-07, "logits/chosen": -1.8403030633926392, "logits/rejected": -1.8317383527755737, "logps/chosen": -37.28785705566406, "logps/rejected": -46.734519958496094, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": 4.729223728179932, "rewards/margins": 1.9518578052520752, "rewards/rejected": 2.7773659229278564, "step": 8331 }, { "epoch": 1.84, "learning_rate": 1.5837048848149183e-07, "logits/chosen": -1.7808068990707397, "logits/rejected": -1.6228885650634766, "logps/chosen": -152.90536499023438, "logps/rejected": -28.314998626708984, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": 4.731405735015869, "rewards/margins": 3.7958364486694336, "rewards/rejected": 0.9355694055557251, "step": 8332 }, { "epoch": 1.84, "learning_rate": 1.5792327369777093e-07, "logits/chosen": -1.866960048675537, "logits/rejected": -1.6774697303771973, "logps/chosen": -80.5767822265625, "logps/rejected": -11.043013572692871, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 3.900228977203369, "rewards/margins": 2.804982900619507, "rewards/rejected": 1.0952460765838623, "step": 8333 }, { "epoch": 1.84, "learning_rate": 1.5747668110926816e-07, "logits/chosen": -2.0217559337615967, "logits/rejected": -1.9114052057266235, "logps/chosen": -63.582706451416016, "logps/rejected": -17.739713668823242, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 5.6795172691345215, "rewards/margins": 3.264303207397461, "rewards/rejected": 2.4152140617370605, "step": 8334 }, { "epoch": 1.84, "learning_rate": 1.570307107733715e-07, "logits/chosen": -2.1522281169891357, "logits/rejected": -2.174565315246582, "logps/chosen": -30.39525032043457, "logps/rejected": -137.15423583984375, "loss": 0.1696, "rewards/accuracies": 1.0, "rewards/chosen": 4.680490016937256, "rewards/margins": 1.3622586727142334, "rewards/rejected": 3.3182313442230225, "step": 8335 }, { "epoch": 1.85, "learning_rate": 1.5658536274738623e-07, "logits/chosen": -1.9932880401611328, "logits/rejected": -1.9506950378417969, "logps/chosen": -102.12521362304688, "logps/rejected": -47.91638946533203, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 5.520181179046631, "rewards/margins": 1.6449346542358398, "rewards/rejected": 3.875246524810791, "step": 8336 }, { "epoch": 1.85, "learning_rate": 1.561406370885382e-07, "logits/chosen": -2.0140676498413086, "logits/rejected": -1.9876960515975952, "logps/chosen": -44.92070007324219, "logps/rejected": -58.80123519897461, "loss": 0.1563, "rewards/accuracies": 1.0, "rewards/chosen": 4.7432990074157715, "rewards/margins": 1.1409876346588135, "rewards/rejected": 3.602311372756958, "step": 8337 }, { "epoch": 1.85, "learning_rate": 1.5569653385397398e-07, "logits/chosen": -1.9569129943847656, "logits/rejected": -1.9802874326705933, "logps/chosen": -57.85442352294922, "logps/rejected": -112.19572448730469, "loss": 0.5561, "rewards/accuracies": 1.0, "rewards/chosen": 5.286141395568848, "rewards/margins": 0.8580498695373535, "rewards/rejected": 4.428091526031494, "step": 8338 }, { "epoch": 1.85, "learning_rate": 1.552530531007601e-07, "logits/chosen": -1.9221571683883667, "logits/rejected": -1.8787328004837036, "logps/chosen": -55.50406265258789, "logps/rejected": -29.85930061340332, "loss": 0.1592, "rewards/accuracies": 1.0, "rewards/chosen": 3.584653854370117, "rewards/margins": 0.9832756519317627, "rewards/rejected": 2.6013782024383545, "step": 8339 }, { "epoch": 1.85, "learning_rate": 1.5481019488588324e-07, "logits/chosen": -1.8442022800445557, "logits/rejected": -1.816457986831665, "logps/chosen": -28.772193908691406, "logps/rejected": -35.93422317504883, "loss": 0.5765, "rewards/accuracies": 1.0, "rewards/chosen": 4.664221286773682, "rewards/margins": 0.599449634552002, "rewards/rejected": 4.06477165222168, "step": 8340 }, { "epoch": 1.85, "learning_rate": 1.5436795926624726e-07, "logits/chosen": -1.7921441793441772, "logits/rejected": -1.8280646800994873, "logps/chosen": -103.9120101928711, "logps/rejected": -50.34751892089844, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 5.631927013397217, "rewards/margins": 2.4944963455200195, "rewards/rejected": 3.1374306678771973, "step": 8341 }, { "epoch": 1.85, "learning_rate": 1.539263462986812e-07, "logits/chosen": -1.772384762763977, "logits/rejected": -1.7522939443588257, "logps/chosen": -39.38410949707031, "logps/rejected": -97.43291473388672, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 6.508754253387451, "rewards/margins": 3.071714162826538, "rewards/rejected": 3.437040090560913, "step": 8342 }, { "epoch": 1.85, "learning_rate": 1.5348535603992855e-07, "logits/chosen": -1.7477370500564575, "logits/rejected": -1.727462649345398, "logps/chosen": -37.09877014160156, "logps/rejected": -34.011070251464844, "loss": 0.5173, "rewards/accuracies": 1.0, "rewards/chosen": 2.853125810623169, "rewards/margins": 0.25073671340942383, "rewards/rejected": 2.602389097213745, "step": 8343 }, { "epoch": 1.85, "learning_rate": 1.530449885466584e-07, "logits/chosen": -1.7755486965179443, "logits/rejected": -1.7755486965179443, "logps/chosen": -59.95240020751953, "logps/rejected": -59.95240020751953, "loss": 0.3467, "rewards/accuracies": 0.0, "rewards/chosen": 6.754120826721191, "rewards/margins": 0.0, "rewards/rejected": 6.754120826721191, "step": 8344 }, { "epoch": 1.85, "learning_rate": 1.5260524387545494e-07, "logits/chosen": -2.020977735519409, "logits/rejected": -1.9329966306686401, "logps/chosen": -64.86354064941406, "logps/rejected": -20.65372085571289, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 2.3620986938476562, "rewards/margins": 1.6770652532577515, "rewards/rejected": 0.6850334405899048, "step": 8345 }, { "epoch": 1.85, "learning_rate": 1.5216612208282466e-07, "logits/chosen": -1.9627677202224731, "logits/rejected": -1.9052494764328003, "logps/chosen": -71.52578735351562, "logps/rejected": -61.204959869384766, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": 3.6694374084472656, "rewards/margins": 2.248084545135498, "rewards/rejected": 1.421352744102478, "step": 8346 }, { "epoch": 1.85, "learning_rate": 1.517276232251941e-07, "logits/chosen": -1.9051201343536377, "logits/rejected": -1.900018572807312, "logps/chosen": -52.824913024902344, "logps/rejected": -61.31739807128906, "loss": 0.9625, "rewards/accuracies": 0.0, "rewards/chosen": 2.9725701808929443, "rewards/margins": -1.5334303379058838, "rewards/rejected": 4.506000518798828, "step": 8347 }, { "epoch": 1.85, "learning_rate": 1.5128974735890867e-07, "logits/chosen": -2.072139024734497, "logits/rejected": -1.9130319356918335, "logps/chosen": -128.9647674560547, "logps/rejected": -48.5478401184082, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 6.812254428863525, "rewards/margins": 3.8031933307647705, "rewards/rejected": 3.009061098098755, "step": 8348 }, { "epoch": 1.85, "learning_rate": 1.5085249454023565e-07, "logits/chosen": -2.1605477333068848, "logits/rejected": -2.3476550579071045, "logps/chosen": -59.901397705078125, "logps/rejected": -94.61344909667969, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 3.8927018642425537, "rewards/margins": 3.1081674098968506, "rewards/rejected": 0.7845344543457031, "step": 8349 }, { "epoch": 1.85, "learning_rate": 1.504158648253584e-07, "logits/chosen": -1.8493242263793945, "logits/rejected": -1.2805850505828857, "logps/chosen": -37.0347900390625, "logps/rejected": -204.95640563964844, "loss": 1.4114, "rewards/accuracies": 0.0, "rewards/chosen": 6.866377353668213, "rewards/margins": -2.724377155303955, "rewards/rejected": 9.590754508972168, "step": 8350 }, { "epoch": 1.85, "learning_rate": 1.4997985827038598e-07, "logits/chosen": -2.2957615852355957, "logits/rejected": -2.2581052780151367, "logps/chosen": -62.8581428527832, "logps/rejected": -42.664390563964844, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 6.3822431564331055, "rewards/margins": 3.563007116317749, "rewards/rejected": 2.8192360401153564, "step": 8351 }, { "epoch": 1.85, "learning_rate": 1.4954447493134182e-07, "logits/chosen": -2.880255699157715, "logits/rejected": -2.8504598140716553, "logps/chosen": -45.37786865234375, "logps/rejected": -100.39446258544922, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": 2.632824659347534, "rewards/margins": 0.7967971563339233, "rewards/rejected": 1.8360275030136108, "step": 8352 }, { "epoch": 1.85, "learning_rate": 1.491097148641729e-07, "logits/chosen": -1.8649154901504517, "logits/rejected": -1.6757818460464478, "logps/chosen": -86.40760040283203, "logps/rejected": -20.847673416137695, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 5.526191234588623, "rewards/margins": 2.4895992279052734, "rewards/rejected": 3.0365920066833496, "step": 8353 }, { "epoch": 1.85, "learning_rate": 1.4867557812474453e-07, "logits/chosen": -1.8601747751235962, "logits/rejected": -1.797282099723816, "logps/chosen": -39.603485107421875, "logps/rejected": -102.9238510131836, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 3.9562766551971436, "rewards/margins": 2.0620827674865723, "rewards/rejected": 1.8941940069198608, "step": 8354 }, { "epoch": 1.85, "learning_rate": 1.4824206476884151e-07, "logits/chosen": -2.044896125793457, "logits/rejected": -2.0065760612487793, "logps/chosen": -109.6053466796875, "logps/rejected": -96.96039581298828, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 5.471982002258301, "rewards/margins": 3.627434730529785, "rewards/rejected": 1.8445472717285156, "step": 8355 }, { "epoch": 1.85, "learning_rate": 1.4780917485216982e-07, "logits/chosen": -1.7851539850234985, "logits/rejected": -1.7851539850234985, "logps/chosen": -46.59381866455078, "logps/rejected": -46.59381866455078, "loss": 0.3698, "rewards/accuracies": 0.0, "rewards/chosen": 3.9254066944122314, "rewards/margins": 0.0, "rewards/rejected": 3.9254066944122314, "step": 8356 }, { "epoch": 1.85, "learning_rate": 1.4737690843035557e-07, "logits/chosen": -1.9718225002288818, "logits/rejected": -1.9358253479003906, "logps/chosen": -65.18667602539062, "logps/rejected": -58.15272903442383, "loss": 0.0824, "rewards/accuracies": 1.0, "rewards/chosen": 5.64437198638916, "rewards/margins": 2.471754312515259, "rewards/rejected": 3.1726176738739014, "step": 8357 }, { "epoch": 1.85, "learning_rate": 1.4694526555894318e-07, "logits/chosen": -1.7216600179672241, "logits/rejected": -1.7285130023956299, "logps/chosen": -47.59336853027344, "logps/rejected": -64.69529724121094, "loss": 0.3383, "rewards/accuracies": 1.0, "rewards/chosen": 3.5638115406036377, "rewards/margins": 0.15503764152526855, "rewards/rejected": 3.408773899078369, "step": 8358 }, { "epoch": 1.85, "learning_rate": 1.465142462933966e-07, "logits/chosen": -1.976359248161316, "logits/rejected": -1.9479984045028687, "logps/chosen": -60.504940032958984, "logps/rejected": -87.74119567871094, "loss": 0.1547, "rewards/accuracies": 1.0, "rewards/chosen": 4.343901634216309, "rewards/margins": 1.4639012813568115, "rewards/rejected": 2.880000352859497, "step": 8359 }, { "epoch": 1.85, "learning_rate": 1.4608385068910324e-07, "logits/chosen": -1.5639126300811768, "logits/rejected": -1.4190558195114136, "logps/chosen": -63.399436950683594, "logps/rejected": -18.969497680664062, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": 2.493462324142456, "rewards/margins": 1.9201956987380981, "rewards/rejected": 0.5732666254043579, "step": 8360 }, { "epoch": 1.85, "learning_rate": 1.4565407880136552e-07, "logits/chosen": -1.7882249355316162, "logits/rejected": -1.6439895629882812, "logps/chosen": -88.0068359375, "logps/rejected": -22.812042236328125, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": 4.809657573699951, "rewards/margins": 4.231539249420166, "rewards/rejected": 0.5781185030937195, "step": 8361 }, { "epoch": 1.85, "learning_rate": 1.4522493068540976e-07, "logits/chosen": -1.9945534467697144, "logits/rejected": -1.8878811597824097, "logps/chosen": -137.16322326660156, "logps/rejected": -103.87396240234375, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 6.452104091644287, "rewards/margins": 2.42588472366333, "rewards/rejected": 4.026219367980957, "step": 8362 }, { "epoch": 1.85, "learning_rate": 1.4479640639637972e-07, "logits/chosen": -2.0017879009246826, "logits/rejected": -1.9833040237426758, "logps/chosen": -43.832420349121094, "logps/rejected": -30.808998107910156, "loss": 0.3856, "rewards/accuracies": 1.0, "rewards/chosen": 3.463855743408203, "rewards/margins": 2.510744094848633, "rewards/rejected": 0.9531116485595703, "step": 8363 }, { "epoch": 1.85, "learning_rate": 1.443685059893396e-07, "logits/chosen": -1.9936697483062744, "logits/rejected": -1.8173154592514038, "logps/chosen": -153.83682250976562, "logps/rejected": -42.38412094116211, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 6.755362033843994, "rewards/margins": 4.40963077545166, "rewards/rejected": 2.345731019973755, "step": 8364 }, { "epoch": 1.85, "learning_rate": 1.4394122951927436e-07, "logits/chosen": -1.9044076204299927, "logits/rejected": -1.3495378494262695, "logps/chosen": -67.7811279296875, "logps/rejected": -130.93026733398438, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 9.42710018157959, "rewards/margins": 1.3648109436035156, "rewards/rejected": 8.062289237976074, "step": 8365 }, { "epoch": 1.85, "learning_rate": 1.435145770410873e-07, "logits/chosen": -1.9184612035751343, "logits/rejected": -1.7633470296859741, "logps/chosen": -51.3035888671875, "logps/rejected": -24.479251861572266, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 3.0167770385742188, "rewards/margins": 1.4477447271347046, "rewards/rejected": 1.5690323114395142, "step": 8366 }, { "epoch": 1.85, "learning_rate": 1.4308854860960286e-07, "logits/chosen": -1.9395310878753662, "logits/rejected": -1.7877247333526611, "logps/chosen": -83.6866226196289, "logps/rejected": -13.04566764831543, "loss": 0.4333, "rewards/accuracies": 1.0, "rewards/chosen": 3.4889564514160156, "rewards/margins": 2.9896883964538574, "rewards/rejected": 0.49926814436912537, "step": 8367 }, { "epoch": 1.85, "learning_rate": 1.4266314427956286e-07, "logits/chosen": -2.017669439315796, "logits/rejected": -1.9545031785964966, "logps/chosen": -64.53466796875, "logps/rejected": -26.544721603393555, "loss": 0.1693, "rewards/accuracies": 1.0, "rewards/chosen": 4.061200141906738, "rewards/margins": 0.9646105766296387, "rewards/rejected": 3.0965895652770996, "step": 8368 }, { "epoch": 1.85, "learning_rate": 1.422383641056335e-07, "logits/chosen": -1.7271684408187866, "logits/rejected": -1.8348249197006226, "logps/chosen": -66.6415786743164, "logps/rejected": -120.59970092773438, "loss": 1.192, "rewards/accuracies": 0.0, "rewards/chosen": 5.483394622802734, "rewards/margins": -2.155184268951416, "rewards/rejected": 7.63857889175415, "step": 8369 }, { "epoch": 1.85, "learning_rate": 1.4181420814239565e-07, "logits/chosen": -1.8000595569610596, "logits/rejected": -1.8334698677062988, "logps/chosen": -32.14935302734375, "logps/rejected": -48.477630615234375, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 2.4849941730499268, "rewards/margins": 0.5037273168563843, "rewards/rejected": 1.9812668561935425, "step": 8370 }, { "epoch": 1.85, "learning_rate": 1.41390676444354e-07, "logits/chosen": -2.3967816829681396, "logits/rejected": -2.399338960647583, "logps/chosen": -115.25698852539062, "logps/rejected": -131.026611328125, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 10.08258056640625, "rewards/margins": 3.4778685569763184, "rewards/rejected": 6.604712009429932, "step": 8371 }, { "epoch": 1.85, "learning_rate": 1.4096776906593056e-07, "logits/chosen": -2.015249252319336, "logits/rejected": -1.9813475608825684, "logps/chosen": -48.83802795410156, "logps/rejected": -55.301055908203125, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 5.467966556549072, "rewards/margins": 1.4044785499572754, "rewards/rejected": 4.063488006591797, "step": 8372 }, { "epoch": 1.85, "learning_rate": 1.4054548606146744e-07, "logits/chosen": -1.7777924537658691, "logits/rejected": -1.8075525760650635, "logps/chosen": -42.65459442138672, "logps/rejected": -50.47282409667969, "loss": 0.4017, "rewards/accuracies": 1.0, "rewards/chosen": 3.8922958374023438, "rewards/margins": 0.07649827003479004, "rewards/rejected": 3.8157975673675537, "step": 8373 }, { "epoch": 1.85, "learning_rate": 1.401238274852279e-07, "logits/chosen": -2.396106481552124, "logits/rejected": -2.334867238998413, "logps/chosen": -64.89894104003906, "logps/rejected": -38.51959991455078, "loss": 0.3052, "rewards/accuracies": 1.0, "rewards/chosen": 3.4102776050567627, "rewards/margins": 3.3904786109924316, "rewards/rejected": 0.019799042493104935, "step": 8374 }, { "epoch": 1.85, "learning_rate": 1.3970279339139358e-07, "logits/chosen": -1.9483795166015625, "logits/rejected": -1.9860337972640991, "logps/chosen": -59.79212951660156, "logps/rejected": -25.582164764404297, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 5.425131320953369, "rewards/margins": 2.5438764095306396, "rewards/rejected": 2.8812549114227295, "step": 8375 }, { "epoch": 1.85, "learning_rate": 1.3928238383406734e-07, "logits/chosen": -1.7273277044296265, "logits/rejected": -1.706135630607605, "logps/chosen": -37.8858528137207, "logps/rejected": -32.314598083496094, "loss": 0.4477, "rewards/accuracies": 0.0, "rewards/chosen": 2.157745361328125, "rewards/margins": -0.36968469619750977, "rewards/rejected": 2.5274300575256348, "step": 8376 }, { "epoch": 1.85, "learning_rate": 1.388625988672687e-07, "logits/chosen": -1.7566899061203003, "logits/rejected": -1.7256388664245605, "logps/chosen": -46.997039794921875, "logps/rejected": -62.11758041381836, "loss": 0.2502, "rewards/accuracies": 1.0, "rewards/chosen": 3.6344711780548096, "rewards/margins": 0.44600868225097656, "rewards/rejected": 3.188462495803833, "step": 8377 }, { "epoch": 1.85, "learning_rate": 1.3844343854494123e-07, "logits/chosen": -1.734075665473938, "logits/rejected": -1.6439282894134521, "logps/chosen": -26.564462661743164, "logps/rejected": -56.93190002441406, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 2.8979756832122803, "rewards/margins": 2.450146198272705, "rewards/rejected": 0.4478294551372528, "step": 8378 }, { "epoch": 1.85, "learning_rate": 1.3802490292094407e-07, "logits/chosen": -2.167940616607666, "logits/rejected": -2.167940616607666, "logps/chosen": -63.776695251464844, "logps/rejected": -63.776695251464844, "loss": 0.3508, "rewards/accuracies": 0.0, "rewards/chosen": 5.899709224700928, "rewards/margins": 0.0, "rewards/rejected": 5.899709224700928, "step": 8379 }, { "epoch": 1.85, "learning_rate": 1.3760699204906025e-07, "logits/chosen": -2.0949676036834717, "logits/rejected": -2.2975146770477295, "logps/chosen": -55.98284912109375, "logps/rejected": -91.31578063964844, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 5.809795379638672, "rewards/margins": 0.019338130950927734, "rewards/rejected": 5.790457248687744, "step": 8380 }, { "epoch": 1.86, "learning_rate": 1.3718970598298797e-07, "logits/chosen": -2.01654314994812, "logits/rejected": -1.9542033672332764, "logps/chosen": -129.3572998046875, "logps/rejected": -46.71368408203125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 10.292826652526855, "rewards/margins": 6.06605339050293, "rewards/rejected": 4.226773262023926, "step": 8381 }, { "epoch": 1.86, "learning_rate": 1.3677304477634935e-07, "logits/chosen": -1.9319190979003906, "logits/rejected": -1.8669248819351196, "logps/chosen": -25.99252700805664, "logps/rejected": -14.750801086425781, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 1.5408176183700562, "rewards/margins": 0.642622709274292, "rewards/rejected": 0.8981949090957642, "step": 8382 }, { "epoch": 1.86, "learning_rate": 1.363570084826832e-07, "logits/chosen": -1.8160942792892456, "logits/rejected": -1.9045153856277466, "logps/chosen": -72.68792724609375, "logps/rejected": -101.37150573730469, "loss": 0.5483, "rewards/accuracies": 0.0, "rewards/chosen": 8.500300407409668, "rewards/margins": -0.5702304840087891, "rewards/rejected": 9.070530891418457, "step": 8383 }, { "epoch": 1.86, "learning_rate": 1.3594159715544953e-07, "logits/chosen": -1.729724645614624, "logits/rejected": -1.7117893695831299, "logps/chosen": -37.01945495605469, "logps/rejected": -49.843658447265625, "loss": 1.3568, "rewards/accuracies": 1.0, "rewards/chosen": 3.0536460876464844, "rewards/margins": 0.3173391819000244, "rewards/rejected": 2.73630690574646, "step": 8384 }, { "epoch": 1.86, "learning_rate": 1.3552681084802844e-07, "logits/chosen": -1.5659468173980713, "logits/rejected": -1.5659468173980713, "logps/chosen": -30.044219970703125, "logps/rejected": -30.044219970703125, "loss": 0.525, "rewards/accuracies": 0.0, "rewards/chosen": 5.1301116943359375, "rewards/margins": 0.0, "rewards/rejected": 5.1301116943359375, "step": 8385 }, { "epoch": 1.86, "learning_rate": 1.351126496137173e-07, "logits/chosen": -2.2312068939208984, "logits/rejected": -2.2822024822235107, "logps/chosen": -98.47987365722656, "logps/rejected": -111.65581512451172, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 9.631550788879395, "rewards/margins": 2.4346413612365723, "rewards/rejected": 7.196909427642822, "step": 8386 }, { "epoch": 1.86, "learning_rate": 1.346991135057363e-07, "logits/chosen": -2.024470567703247, "logits/rejected": -2.024470567703247, "logps/chosen": -21.69886016845703, "logps/rejected": -21.69886016845703, "loss": 0.3549, "rewards/accuracies": 0.0, "rewards/chosen": 3.4892144203186035, "rewards/margins": 0.0, "rewards/rejected": 3.4892144203186035, "step": 8387 }, { "epoch": 1.86, "learning_rate": 1.3428620257722292e-07, "logits/chosen": -2.1674575805664062, "logits/rejected": -2.1538639068603516, "logps/chosen": -158.78121948242188, "logps/rejected": -82.51676940917969, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 11.344382286071777, "rewards/margins": 2.9525680541992188, "rewards/rejected": 8.391814231872559, "step": 8388 }, { "epoch": 1.86, "learning_rate": 1.3387391688123641e-07, "logits/chosen": -2.405351400375366, "logits/rejected": -2.4003283977508545, "logps/chosen": -119.49012756347656, "logps/rejected": -115.79947662353516, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 6.495728969573975, "rewards/margins": 3.1608328819274902, "rewards/rejected": 3.3348960876464844, "step": 8389 }, { "epoch": 1.86, "learning_rate": 1.334622564707533e-07, "logits/chosen": -1.7832329273223877, "logits/rejected": -1.7924846410751343, "logps/chosen": -35.646942138671875, "logps/rejected": -43.644874572753906, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 3.695770263671875, "rewards/margins": 0.5854780673980713, "rewards/rejected": 3.1102921962738037, "step": 8390 }, { "epoch": 1.86, "learning_rate": 1.3305122139867178e-07, "logits/chosen": -1.8141727447509766, "logits/rejected": -1.4974569082260132, "logps/chosen": -78.77101135253906, "logps/rejected": -50.59186553955078, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 7.267053127288818, "rewards/margins": 4.00357723236084, "rewards/rejected": 3.2634758949279785, "step": 8391 }, { "epoch": 1.86, "learning_rate": 1.3264081171780797e-07, "logits/chosen": -2.0598955154418945, "logits/rejected": -1.3765546083450317, "logps/chosen": -61.69799041748047, "logps/rejected": -134.70127868652344, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 6.996924877166748, "rewards/margins": 1.4294185638427734, "rewards/rejected": 5.567506313323975, "step": 8392 }, { "epoch": 1.86, "learning_rate": 1.322310274808991e-07, "logits/chosen": -2.1143603324890137, "logits/rejected": -2.0657174587249756, "logps/chosen": -67.97026824951172, "logps/rejected": -90.86527252197266, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 6.523325443267822, "rewards/margins": 2.807634115219116, "rewards/rejected": 3.715691328048706, "step": 8393 }, { "epoch": 1.86, "learning_rate": 1.3182186874060199e-07, "logits/chosen": -1.8759329319000244, "logits/rejected": -1.819668173789978, "logps/chosen": -68.81448364257812, "logps/rejected": -38.11302947998047, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 3.6168975830078125, "rewards/margins": 0.7697913646697998, "rewards/rejected": 2.8471062183380127, "step": 8394 }, { "epoch": 1.86, "learning_rate": 1.314133355494912e-07, "logits/chosen": -1.6976042985916138, "logits/rejected": -1.6869536638259888, "logps/chosen": -33.130958557128906, "logps/rejected": -37.863441467285156, "loss": 1.4132, "rewards/accuracies": 1.0, "rewards/chosen": 2.600491762161255, "rewards/margins": 0.27022671699523926, "rewards/rejected": 2.3302650451660156, "step": 8395 }, { "epoch": 1.86, "learning_rate": 1.3100542796006366e-07, "logits/chosen": -1.8232101202011108, "logits/rejected": -1.8502779006958008, "logps/chosen": -49.028324127197266, "logps/rejected": -72.17427062988281, "loss": 0.6851, "rewards/accuracies": 0.0, "rewards/chosen": 3.8801732063293457, "rewards/margins": -0.2661557197570801, "rewards/rejected": 4.146328926086426, "step": 8396 }, { "epoch": 1.86, "learning_rate": 1.3059814602473298e-07, "logits/chosen": -1.958817720413208, "logits/rejected": -1.9539031982421875, "logps/chosen": -37.896034240722656, "logps/rejected": -60.973350524902344, "loss": 0.5123, "rewards/accuracies": 0.0, "rewards/chosen": 3.565065860748291, "rewards/margins": -0.38004612922668457, "rewards/rejected": 3.9451119899749756, "step": 8397 }, { "epoch": 1.86, "learning_rate": 1.301914897958362e-07, "logits/chosen": -1.8442435264587402, "logits/rejected": -1.8442435264587402, "logps/chosen": -24.221813201904297, "logps/rejected": -24.221813201904297, "loss": 0.4394, "rewards/accuracies": 0.0, "rewards/chosen": 2.9340102672576904, "rewards/margins": 0.0, "rewards/rejected": 2.9340102672576904, "step": 8398 }, { "epoch": 1.86, "learning_rate": 1.2978545932562537e-07, "logits/chosen": -1.8046246767044067, "logits/rejected": -1.790401816368103, "logps/chosen": -111.7587890625, "logps/rejected": -103.89236450195312, "loss": 0.0866, "rewards/accuracies": 1.0, "rewards/chosen": 9.090753555297852, "rewards/margins": 2.2712907791137695, "rewards/rejected": 6.819462776184082, "step": 8399 }, { "epoch": 1.86, "learning_rate": 1.2938005466627545e-07, "logits/chosen": -1.7229130268096924, "logits/rejected": -1.8164118528366089, "logps/chosen": -105.71467590332031, "logps/rejected": -57.833229064941406, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 7.540138244628906, "rewards/margins": 2.3158440589904785, "rewards/rejected": 5.224294185638428, "step": 8400 }, { "epoch": 1.86, "learning_rate": 1.289752758698798e-07, "logits/chosen": -2.006612777709961, "logits/rejected": -1.9786202907562256, "logps/chosen": -76.21652221679688, "logps/rejected": -217.8487548828125, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 12.357464790344238, "rewards/margins": 2.8277339935302734, "rewards/rejected": 9.529730796813965, "step": 8401 }, { "epoch": 1.86, "learning_rate": 1.2857112298845175e-07, "logits/chosen": -2.031960964202881, "logits/rejected": -2.0101559162139893, "logps/chosen": -27.830284118652344, "logps/rejected": -45.33375549316406, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 3.94758677482605, "rewards/margins": 2.577785015106201, "rewards/rejected": 1.3698017597198486, "step": 8402 }, { "epoch": 1.86, "learning_rate": 1.2816759607392372e-07, "logits/chosen": -1.9243667125701904, "logits/rejected": -1.9126691818237305, "logps/chosen": -52.58648681640625, "logps/rejected": -79.64100646972656, "loss": 0.166, "rewards/accuracies": 1.0, "rewards/chosen": 6.3795671463012695, "rewards/margins": 2.094407081604004, "rewards/rejected": 4.285160064697266, "step": 8403 }, { "epoch": 1.86, "learning_rate": 1.2776469517814803e-07, "logits/chosen": -1.8988347053527832, "logits/rejected": -1.9342460632324219, "logps/chosen": -48.321617126464844, "logps/rejected": -85.56350708007812, "loss": 0.7284, "rewards/accuracies": 0.0, "rewards/chosen": 3.643404483795166, "rewards/margins": -1.026120662689209, "rewards/rejected": 4.669525146484375, "step": 8404 }, { "epoch": 1.86, "learning_rate": 1.273624203528967e-07, "logits/chosen": -2.1960535049438477, "logits/rejected": -2.1692121028900146, "logps/chosen": -71.39274597167969, "logps/rejected": -85.79051208496094, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": 8.073145866394043, "rewards/margins": 5.26249361038208, "rewards/rejected": 2.810652256011963, "step": 8405 }, { "epoch": 1.86, "learning_rate": 1.2696077164986e-07, "logits/chosen": -1.8229739665985107, "logits/rejected": -1.7462092638015747, "logps/chosen": -98.60235595703125, "logps/rejected": -107.46623229980469, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": 6.660301208496094, "rewards/margins": 3.50634765625, "rewards/rejected": 3.1539535522460938, "step": 8406 }, { "epoch": 1.86, "learning_rate": 1.2655974912065106e-07, "logits/chosen": -1.9806805849075317, "logits/rejected": -1.9695229530334473, "logps/chosen": -170.7331085205078, "logps/rejected": -63.34043502807617, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 12.43915843963623, "rewards/margins": 3.519801139831543, "rewards/rejected": 8.919357299804688, "step": 8407 }, { "epoch": 1.86, "learning_rate": 1.2615935281679815e-07, "logits/chosen": -1.8262670040130615, "logits/rejected": -1.7910531759262085, "logps/chosen": -54.594154357910156, "logps/rejected": -68.33287048339844, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 3.166278839111328, "rewards/margins": 0.9896736145019531, "rewards/rejected": 2.176605224609375, "step": 8408 }, { "epoch": 1.86, "learning_rate": 1.2575958278975176e-07, "logits/chosen": -2.2421927452087402, "logits/rejected": -2.2694151401519775, "logps/chosen": -138.20179748535156, "logps/rejected": -178.57566833496094, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 14.140227317810059, "rewards/margins": 3.7051000595092773, "rewards/rejected": 10.435127258300781, "step": 8409 }, { "epoch": 1.86, "learning_rate": 1.253604390908819e-07, "logits/chosen": -1.7538809776306152, "logits/rejected": -1.7606830596923828, "logps/chosen": -37.151817321777344, "logps/rejected": -65.1183090209961, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 2.832475423812866, "rewards/margins": 0.014128923416137695, "rewards/rejected": 2.8183465003967285, "step": 8410 }, { "epoch": 1.86, "learning_rate": 1.249619217714776e-07, "logits/chosen": -1.9638856649398804, "logits/rejected": -1.9254035949707031, "logps/chosen": -130.6030731201172, "logps/rejected": -93.33272552490234, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 7.543862819671631, "rewards/margins": 1.9681205749511719, "rewards/rejected": 5.575742244720459, "step": 8411 }, { "epoch": 1.86, "learning_rate": 1.2456403088274672e-07, "logits/chosen": -1.9526019096374512, "logits/rejected": -1.8576358556747437, "logps/chosen": -40.38459014892578, "logps/rejected": -18.98145294189453, "loss": 0.1136, "rewards/accuracies": 1.0, "rewards/chosen": 2.5242364406585693, "rewards/margins": 1.6323931217193604, "rewards/rejected": 0.8918432593345642, "step": 8412 }, { "epoch": 1.86, "learning_rate": 1.2416676647581782e-07, "logits/chosen": -1.672112226486206, "logits/rejected": -1.6210421323776245, "logps/chosen": -36.232933044433594, "logps/rejected": -64.60147094726562, "loss": 0.5698, "rewards/accuracies": 1.0, "rewards/chosen": 5.107068061828613, "rewards/margins": 1.1125459671020508, "rewards/rejected": 3.9945220947265625, "step": 8413 }, { "epoch": 1.86, "learning_rate": 1.2377012860173786e-07, "logits/chosen": -2.033846855163574, "logits/rejected": -2.0530567169189453, "logps/chosen": -45.67766571044922, "logps/rejected": -107.72564697265625, "loss": 0.2188, "rewards/accuracies": 1.0, "rewards/chosen": 3.643995761871338, "rewards/margins": 0.6618821620941162, "rewards/rejected": 2.9821135997772217, "step": 8414 }, { "epoch": 1.86, "learning_rate": 1.2337411731147498e-07, "logits/chosen": -1.847442865371704, "logits/rejected": -1.8834246397018433, "logps/chosen": -35.789188385009766, "logps/rejected": -92.20657348632812, "loss": 0.563, "rewards/accuracies": 0.0, "rewards/chosen": 4.505070209503174, "rewards/margins": -0.44933652877807617, "rewards/rejected": 4.95440673828125, "step": 8415 }, { "epoch": 1.86, "learning_rate": 1.2297873265591453e-07, "logits/chosen": -2.040916919708252, "logits/rejected": -2.0949535369873047, "logps/chosen": -107.41069030761719, "logps/rejected": -105.6158218383789, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 10.50480842590332, "rewards/margins": 5.194851398468018, "rewards/rejected": 5.309957027435303, "step": 8416 }, { "epoch": 1.86, "learning_rate": 1.2258397468586314e-07, "logits/chosen": -1.9592828750610352, "logits/rejected": -1.9696521759033203, "logps/chosen": -37.365272521972656, "logps/rejected": -43.16847229003906, "loss": 0.2828, "rewards/accuracies": 1.0, "rewards/chosen": 2.812857151031494, "rewards/margins": 0.5244889259338379, "rewards/rejected": 2.2883682250976562, "step": 8417 }, { "epoch": 1.86, "learning_rate": 1.2218984345204576e-07, "logits/chosen": -2.2123944759368896, "logits/rejected": -2.2529239654541016, "logps/chosen": -48.51738357543945, "logps/rejected": -58.5799446105957, "loss": 0.7007, "rewards/accuracies": 0.0, "rewards/chosen": 3.965444564819336, "rewards/margins": -0.8269171714782715, "rewards/rejected": 4.792361736297607, "step": 8418 }, { "epoch": 1.86, "learning_rate": 1.2179633900510745e-07, "logits/chosen": -1.6660360097885132, "logits/rejected": -1.6229569911956787, "logps/chosen": -51.459449768066406, "logps/rejected": -67.80335998535156, "loss": 0.0639, "rewards/accuracies": 1.0, "rewards/chosen": 3.5200157165527344, "rewards/margins": 2.5045461654663086, "rewards/rejected": 1.0154694318771362, "step": 8419 }, { "epoch": 1.86, "learning_rate": 1.2140346139561277e-07, "logits/chosen": -1.7165533304214478, "logits/rejected": -1.6793277263641357, "logps/chosen": -79.21563720703125, "logps/rejected": -73.0077896118164, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": 7.02308988571167, "rewards/margins": 1.6358566284179688, "rewards/rejected": 5.387233257293701, "step": 8420 }, { "epoch": 1.86, "learning_rate": 1.2101121067404576e-07, "logits/chosen": -2.302992343902588, "logits/rejected": -2.3548152446746826, "logps/chosen": -116.99005126953125, "logps/rejected": -59.21376037597656, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": 6.503836154937744, "rewards/margins": 1.57342529296875, "rewards/rejected": 4.930410861968994, "step": 8421 }, { "epoch": 1.86, "learning_rate": 1.206195868908089e-07, "logits/chosen": -2.119135618209839, "logits/rejected": -2.1627538204193115, "logps/chosen": -36.05763244628906, "logps/rejected": -118.18341827392578, "loss": 2.1029, "rewards/accuracies": 0.0, "rewards/chosen": 5.853466033935547, "rewards/margins": -4.142587661743164, "rewards/rejected": 9.996053695678711, "step": 8422 }, { "epoch": 1.86, "learning_rate": 1.2022859009622578e-07, "logits/chosen": -1.82071852684021, "logits/rejected": -1.808402180671692, "logps/chosen": -43.818084716796875, "logps/rejected": -76.00862121582031, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 2.9814612865448, "rewards/margins": 1.2117103338241577, "rewards/rejected": 1.769750952720642, "step": 8423 }, { "epoch": 1.86, "learning_rate": 1.198382203405385e-07, "logits/chosen": -2.0722455978393555, "logits/rejected": -2.0513830184936523, "logps/chosen": -120.44174194335938, "logps/rejected": -60.33441162109375, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 8.988736152648926, "rewards/margins": 2.203540325164795, "rewards/rejected": 6.785195827484131, "step": 8424 }, { "epoch": 1.86, "learning_rate": 1.19448477673908e-07, "logits/chosen": -1.8961435556411743, "logits/rejected": -1.9430677890777588, "logps/chosen": -35.2376708984375, "logps/rejected": -88.07798767089844, "loss": 0.8747, "rewards/accuracies": 0.0, "rewards/chosen": 4.062364101409912, "rewards/margins": -1.4726533889770508, "rewards/rejected": 5.535017490386963, "step": 8425 }, { "epoch": 1.86, "learning_rate": 1.1905936214641533e-07, "logits/chosen": -1.633858323097229, "logits/rejected": -1.633858323097229, "logps/chosen": -47.43268966674805, "logps/rejected": -47.43268966674805, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 2.7619495391845703, "rewards/margins": 0.0, "rewards/rejected": 2.7619495391845703, "step": 8426 }, { "epoch": 1.87, "learning_rate": 1.1867087380806164e-07, "logits/chosen": -1.9224166870117188, "logits/rejected": -1.9224166870117188, "logps/chosen": -117.00093841552734, "logps/rejected": -117.00093841552734, "loss": 0.3496, "rewards/accuracies": 0.0, "rewards/chosen": 13.08398151397705, "rewards/margins": 0.0, "rewards/rejected": 13.08398151397705, "step": 8427 }, { "epoch": 1.87, "learning_rate": 1.1828301270876585e-07, "logits/chosen": -2.1370596885681152, "logits/rejected": -2.1896681785583496, "logps/chosen": -38.178009033203125, "logps/rejected": -106.68214416503906, "loss": 2.0627, "rewards/accuracies": 0.0, "rewards/chosen": 4.313673496246338, "rewards/margins": -4.103628635406494, "rewards/rejected": 8.417302131652832, "step": 8428 }, { "epoch": 1.87, "learning_rate": 1.1789577889836757e-07, "logits/chosen": -1.8912005424499512, "logits/rejected": -1.8912005424499512, "logps/chosen": -64.89576721191406, "logps/rejected": -64.89576721191406, "loss": 0.4022, "rewards/accuracies": 0.0, "rewards/chosen": 5.064647674560547, "rewards/margins": 0.0, "rewards/rejected": 5.064647674560547, "step": 8429 }, { "epoch": 1.87, "learning_rate": 1.1750917242662585e-07, "logits/chosen": -2.0123846530914307, "logits/rejected": -1.9526150226593018, "logps/chosen": -51.86107635498047, "logps/rejected": -10.382430076599121, "loss": 0.0707, "rewards/accuracies": 1.0, "rewards/chosen": 5.859729766845703, "rewards/margins": 4.625503063201904, "rewards/rejected": 1.2342267036437988, "step": 8430 }, { "epoch": 1.87, "learning_rate": 1.171231933432182e-07, "logits/chosen": -1.9484225511550903, "logits/rejected": -1.9522312879562378, "logps/chosen": -56.976036071777344, "logps/rejected": -97.25689697265625, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": 4.834687232971191, "rewards/margins": 0.15083074569702148, "rewards/rejected": 4.68385648727417, "step": 8431 }, { "epoch": 1.87, "learning_rate": 1.167378416977416e-07, "logits/chosen": -1.4799963235855103, "logits/rejected": -1.4255791902542114, "logps/chosen": -39.38087463378906, "logps/rejected": -39.965667724609375, "loss": 0.2634, "rewards/accuracies": 1.0, "rewards/chosen": 3.342792510986328, "rewards/margins": 0.656883955001831, "rewards/rejected": 2.685908555984497, "step": 8432 }, { "epoch": 1.87, "learning_rate": 1.1635311753971424e-07, "logits/chosen": -2.146867036819458, "logits/rejected": -2.133427143096924, "logps/chosen": -37.00115966796875, "logps/rejected": -46.16704559326172, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 3.5555222034454346, "rewards/margins": 2.003175735473633, "rewards/rejected": 1.5523464679718018, "step": 8433 }, { "epoch": 1.87, "learning_rate": 1.1596902091857043e-07, "logits/chosen": -1.9648079872131348, "logits/rejected": -1.9648079872131348, "logps/chosen": -36.098819732666016, "logps/rejected": -36.098819732666016, "loss": 0.4529, "rewards/accuracies": 0.0, "rewards/chosen": 4.854457378387451, "rewards/margins": 0.0, "rewards/rejected": 4.854457378387451, "step": 8434 }, { "epoch": 1.87, "learning_rate": 1.1558555188366737e-07, "logits/chosen": -1.9603749513626099, "logits/rejected": -1.9543092250823975, "logps/chosen": -16.922229766845703, "logps/rejected": -36.26106643676758, "loss": 1.4582, "rewards/accuracies": 1.0, "rewards/chosen": 2.3379154205322266, "rewards/margins": 1.1436408758163452, "rewards/rejected": 1.1942745447158813, "step": 8435 }, { "epoch": 1.87, "learning_rate": 1.1520271048427844e-07, "logits/chosen": -1.7947851419448853, "logits/rejected": -1.8201258182525635, "logps/chosen": -27.20592498779297, "logps/rejected": -48.41548538208008, "loss": 0.5085, "rewards/accuracies": 0.0, "rewards/chosen": 3.258441925048828, "rewards/margins": -0.5562870502471924, "rewards/rejected": 3.8147289752960205, "step": 8436 }, { "epoch": 1.87, "learning_rate": 1.1482049676959872e-07, "logits/chosen": -2.2242841720581055, "logits/rejected": -2.2185356616973877, "logps/chosen": -101.30218505859375, "logps/rejected": -124.28832244873047, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 11.84594440460205, "rewards/margins": 1.4013938903808594, "rewards/rejected": 10.444550514221191, "step": 8437 }, { "epoch": 1.87, "learning_rate": 1.1443891078874115e-07, "logits/chosen": -1.8527798652648926, "logits/rejected": -1.854783058166504, "logps/chosen": -56.54080581665039, "logps/rejected": -39.09231948852539, "loss": 0.2845, "rewards/accuracies": 1.0, "rewards/chosen": 3.190126419067383, "rewards/margins": 0.8354620933532715, "rewards/rejected": 2.3546643257141113, "step": 8438 }, { "epoch": 1.87, "learning_rate": 1.1405795259073927e-07, "logits/chosen": -1.9801238775253296, "logits/rejected": -1.9801238775253296, "logps/chosen": -28.63471031188965, "logps/rejected": -28.63471031188965, "loss": 0.5123, "rewards/accuracies": 0.0, "rewards/chosen": 7.506032466888428, "rewards/margins": 0.0, "rewards/rejected": 7.506032466888428, "step": 8439 }, { "epoch": 1.87, "learning_rate": 1.1367762222454503e-07, "logits/chosen": -2.0324788093566895, "logits/rejected": -1.9756959676742554, "logps/chosen": -96.70884704589844, "logps/rejected": -19.062610626220703, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 6.577919006347656, "rewards/margins": 4.485651016235352, "rewards/rejected": 2.0922677516937256, "step": 8440 }, { "epoch": 1.87, "learning_rate": 1.1329791973902993e-07, "logits/chosen": -2.056852102279663, "logits/rejected": -2.0313076972961426, "logps/chosen": -71.413818359375, "logps/rejected": -78.9249038696289, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 9.19108772277832, "rewards/margins": 4.9361748695373535, "rewards/rejected": 4.254912853240967, "step": 8441 }, { "epoch": 1.87, "learning_rate": 1.129188451829849e-07, "logits/chosen": -2.06327748298645, "logits/rejected": -2.00653338432312, "logps/chosen": -45.14799499511719, "logps/rejected": -4.933607578277588, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 2.8538124561309814, "rewards/margins": 2.118454694747925, "rewards/rejected": 0.7353577017784119, "step": 8442 }, { "epoch": 1.87, "learning_rate": 1.1254039860511989e-07, "logits/chosen": -2.1847825050354004, "logits/rejected": -1.954992651939392, "logps/chosen": -141.08935546875, "logps/rejected": -94.71471405029297, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 7.9290452003479, "rewards/margins": 9.77861499786377, "rewards/rejected": -1.8495696783065796, "step": 8443 }, { "epoch": 1.87, "learning_rate": 1.1216258005406489e-07, "logits/chosen": -1.8468432426452637, "logits/rejected": -1.8325397968292236, "logps/chosen": -57.8902587890625, "logps/rejected": -50.2906494140625, "loss": 0.2149, "rewards/accuracies": 1.0, "rewards/chosen": 3.879828691482544, "rewards/margins": 0.6435370445251465, "rewards/rejected": 3.2362916469573975, "step": 8444 }, { "epoch": 1.87, "learning_rate": 1.1178538957836771e-07, "logits/chosen": -1.6748005151748657, "logits/rejected": -1.666795015335083, "logps/chosen": -37.14796447753906, "logps/rejected": -42.37907409667969, "loss": 0.2117, "rewards/accuracies": 1.0, "rewards/chosen": 3.9466965198516846, "rewards/margins": 0.7461585998535156, "rewards/rejected": 3.200537919998169, "step": 8445 }, { "epoch": 1.87, "learning_rate": 1.1140882722649737e-07, "logits/chosen": -2.025432586669922, "logits/rejected": -2.0213475227355957, "logps/chosen": -28.884963989257812, "logps/rejected": -29.281715393066406, "loss": 0.3963, "rewards/accuracies": 0.0, "rewards/chosen": 2.772167682647705, "rewards/margins": -0.1106255054473877, "rewards/rejected": 2.8827931880950928, "step": 8446 }, { "epoch": 1.87, "learning_rate": 1.1103289304684073e-07, "logits/chosen": -2.073863983154297, "logits/rejected": -2.090193271636963, "logps/chosen": -38.00743865966797, "logps/rejected": -70.17066955566406, "loss": 0.7923, "rewards/accuracies": 0.0, "rewards/chosen": 2.876448154449463, "rewards/margins": -1.240830898284912, "rewards/rejected": 4.117279052734375, "step": 8447 }, { "epoch": 1.87, "learning_rate": 1.1065758708770468e-07, "logits/chosen": -2.002027750015259, "logits/rejected": -2.0209381580352783, "logps/chosen": -49.20159149169922, "logps/rejected": -59.758201599121094, "loss": 0.5716, "rewards/accuracies": 1.0, "rewards/chosen": 3.1017441749572754, "rewards/margins": 0.8084657192230225, "rewards/rejected": 2.293278455734253, "step": 8448 }, { "epoch": 1.87, "learning_rate": 1.1028290939731512e-07, "logits/chosen": -1.8369897603988647, "logits/rejected": -1.8369897603988647, "logps/chosen": -77.38482666015625, "logps/rejected": -77.38482666015625, "loss": 0.3639, "rewards/accuracies": 0.0, "rewards/chosen": 7.173559665679932, "rewards/margins": 0.0, "rewards/rejected": 7.173559665679932, "step": 8449 }, { "epoch": 1.87, "learning_rate": 1.099088600238174e-07, "logits/chosen": -1.8738528490066528, "logits/rejected": -1.8816806077957153, "logps/chosen": -67.31211853027344, "logps/rejected": -84.29376220703125, "loss": 0.3606, "rewards/accuracies": 1.0, "rewards/chosen": 4.095864772796631, "rewards/margins": 1.0806586742401123, "rewards/rejected": 3.0152060985565186, "step": 8450 }, { "epoch": 1.87, "learning_rate": 1.0953543901527585e-07, "logits/chosen": -1.959443211555481, "logits/rejected": -1.959443211555481, "logps/chosen": -107.61392211914062, "logps/rejected": -107.61392211914062, "loss": 0.3494, "rewards/accuracies": 0.0, "rewards/chosen": 8.947076797485352, "rewards/margins": 0.0, "rewards/rejected": 8.947076797485352, "step": 8451 }, { "epoch": 1.87, "learning_rate": 1.0916264641967267e-07, "logits/chosen": -1.8750920295715332, "logits/rejected": -1.7912640571594238, "logps/chosen": -28.83835220336914, "logps/rejected": -34.55778121948242, "loss": 0.9362, "rewards/accuracies": 1.0, "rewards/chosen": 4.016355514526367, "rewards/margins": 0.29440903663635254, "rewards/rejected": 3.7219464778900146, "step": 8452 }, { "epoch": 1.87, "learning_rate": 1.0879048228491396e-07, "logits/chosen": -1.7668602466583252, "logits/rejected": -1.7562336921691895, "logps/chosen": -27.68526840209961, "logps/rejected": -59.769561767578125, "loss": 0.4383, "rewards/accuracies": 0.0, "rewards/chosen": 3.1972897052764893, "rewards/margins": -0.05074191093444824, "rewards/rejected": 3.2480316162109375, "step": 8453 }, { "epoch": 1.87, "learning_rate": 1.084189466588187e-07, "logits/chosen": -1.7947989702224731, "logits/rejected": -1.6827256679534912, "logps/chosen": -78.51506042480469, "logps/rejected": -59.01532745361328, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 6.385885715484619, "rewards/margins": 1.8941888809204102, "rewards/rejected": 4.491696834564209, "step": 8454 }, { "epoch": 1.87, "learning_rate": 1.0804803958913035e-07, "logits/chosen": -1.7830533981323242, "logits/rejected": -1.7830533981323242, "logps/chosen": -42.58960723876953, "logps/rejected": -42.58960723876953, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 2.0532214641571045, "rewards/margins": 0.0, "rewards/rejected": 2.0532214641571045, "step": 8455 }, { "epoch": 1.87, "learning_rate": 1.0767776112350914e-07, "logits/chosen": -1.8169493675231934, "logits/rejected": -1.8870210647583008, "logps/chosen": -40.77006912231445, "logps/rejected": -148.64520263671875, "loss": 0.6364, "rewards/accuracies": 0.0, "rewards/chosen": 4.693431377410889, "rewards/margins": -0.9362883567810059, "rewards/rejected": 5.6297197341918945, "step": 8456 }, { "epoch": 1.87, "learning_rate": 1.0730811130953477e-07, "logits/chosen": -2.0471274852752686, "logits/rejected": -1.9837297201156616, "logps/chosen": -61.8006591796875, "logps/rejected": -14.270133972167969, "loss": 0.2109, "rewards/accuracies": 1.0, "rewards/chosen": 4.090843200683594, "rewards/margins": 0.803905725479126, "rewards/rejected": 3.2869374752044678, "step": 8457 }, { "epoch": 1.87, "learning_rate": 1.0693909019470594e-07, "logits/chosen": -2.0933895111083984, "logits/rejected": -2.1073336601257324, "logps/chosen": -73.42882537841797, "logps/rejected": -126.59080505371094, "loss": 0.2012, "rewards/accuracies": 1.0, "rewards/chosen": 9.268192291259766, "rewards/margins": 0.821110725402832, "rewards/rejected": 8.447081565856934, "step": 8458 }, { "epoch": 1.87, "learning_rate": 1.0657069782644191e-07, "logits/chosen": -1.8257941007614136, "logits/rejected": -1.8257941007614136, "logps/chosen": -41.0033073425293, "logps/rejected": -41.0033073425293, "loss": 0.5046, "rewards/accuracies": 0.0, "rewards/chosen": 4.374295711517334, "rewards/margins": 0.0, "rewards/rejected": 4.374295711517334, "step": 8459 }, { "epoch": 1.87, "learning_rate": 1.0620293425207984e-07, "logits/chosen": -2.0102179050445557, "logits/rejected": -2.083468198776245, "logps/chosen": -64.03691864013672, "logps/rejected": -112.52976989746094, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": 8.34342098236084, "rewards/margins": 1.1043448448181152, "rewards/rejected": 7.239076137542725, "step": 8460 }, { "epoch": 1.87, "learning_rate": 1.058357995188758e-07, "logits/chosen": -1.9440293312072754, "logits/rejected": -2.041588068008423, "logps/chosen": -67.47117614746094, "logps/rejected": -244.01583862304688, "loss": 0.5073, "rewards/accuracies": 1.0, "rewards/chosen": 12.43528938293457, "rewards/margins": 5.558726787567139, "rewards/rejected": 6.876562595367432, "step": 8461 }, { "epoch": 1.87, "learning_rate": 1.0546929367400705e-07, "logits/chosen": -1.8119022846221924, "logits/rejected": -1.8119022846221924, "logps/chosen": -32.96959686279297, "logps/rejected": -32.96959686279297, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 2.1254982948303223, "rewards/margins": 0.0, "rewards/rejected": 2.1254982948303223, "step": 8462 }, { "epoch": 1.87, "learning_rate": 1.0510341676456648e-07, "logits/chosen": -1.6206488609313965, "logits/rejected": -1.5047491788864136, "logps/chosen": -18.82492446899414, "logps/rejected": -29.804927825927734, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 1.7237904071807861, "rewards/margins": 0.6649292707443237, "rewards/rejected": 1.0588611364364624, "step": 8463 }, { "epoch": 1.87, "learning_rate": 1.0473816883757149e-07, "logits/chosen": -1.735437035560608, "logits/rejected": -1.735437035560608, "logps/chosen": -34.96452331542969, "logps/rejected": -34.96452331542969, "loss": 0.4014, "rewards/accuracies": 0.0, "rewards/chosen": 3.8724288940429688, "rewards/margins": 0.0, "rewards/rejected": 3.8724288940429688, "step": 8464 }, { "epoch": 1.87, "learning_rate": 1.0437354993995341e-07, "logits/chosen": -2.3535542488098145, "logits/rejected": -2.3520572185516357, "logps/chosen": -45.49825668334961, "logps/rejected": -32.75348663330078, "loss": 0.3595, "rewards/accuracies": 0.0, "rewards/chosen": 4.3806867599487305, "rewards/margins": -0.0381312370300293, "rewards/rejected": 4.41881799697876, "step": 8465 }, { "epoch": 1.87, "learning_rate": 1.0400956011856478e-07, "logits/chosen": -2.163444995880127, "logits/rejected": -2.0497090816497803, "logps/chosen": -94.52164459228516, "logps/rejected": -51.918357849121094, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 7.997727394104004, "rewards/margins": 6.496200084686279, "rewards/rejected": 1.5015274286270142, "step": 8466 }, { "epoch": 1.87, "learning_rate": 1.0364619942017928e-07, "logits/chosen": -1.7356979846954346, "logits/rejected": -1.801639437675476, "logps/chosen": -20.711181640625, "logps/rejected": -124.81615447998047, "loss": 0.3835, "rewards/accuracies": 0.0, "rewards/chosen": 6.690667152404785, "rewards/margins": -0.04116058349609375, "rewards/rejected": 6.731827735900879, "step": 8467 }, { "epoch": 1.87, "learning_rate": 1.0328346789148513e-07, "logits/chosen": -1.9414432048797607, "logits/rejected": -1.915636420249939, "logps/chosen": -32.0118522644043, "logps/rejected": -47.38330841064453, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": 3.2330920696258545, "rewards/margins": 0.9828281402587891, "rewards/rejected": 2.2502639293670654, "step": 8468 }, { "epoch": 1.87, "learning_rate": 1.0292136557909505e-07, "logits/chosen": -2.0276787281036377, "logits/rejected": -2.0343880653381348, "logps/chosen": -30.643808364868164, "logps/rejected": -44.14088439941406, "loss": 0.9438, "rewards/accuracies": 0.0, "rewards/chosen": 3.1617777347564697, "rewards/margins": -1.334315538406372, "rewards/rejected": 4.496093273162842, "step": 8469 }, { "epoch": 1.87, "learning_rate": 1.025598925295368e-07, "logits/chosen": -2.0226876735687256, "logits/rejected": -2.025014638900757, "logps/chosen": -101.78224182128906, "logps/rejected": -136.57325744628906, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 10.201746940612793, "rewards/margins": 2.986445426940918, "rewards/rejected": 7.215301513671875, "step": 8470 }, { "epoch": 1.87, "learning_rate": 1.0219904878925935e-07, "logits/chosen": -2.139136552810669, "logits/rejected": -2.115382432937622, "logps/chosen": -45.000450134277344, "logps/rejected": -64.21387481689453, "loss": 0.2367, "rewards/accuracies": 1.0, "rewards/chosen": 4.413218975067139, "rewards/margins": 0.566967248916626, "rewards/rejected": 3.8462517261505127, "step": 8471 }, { "epoch": 1.88, "learning_rate": 1.0183883440463004e-07, "logits/chosen": -1.6470866203308105, "logits/rejected": -1.6527191400527954, "logps/chosen": -47.13562774658203, "logps/rejected": -55.336002349853516, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": 2.9792137145996094, "rewards/margins": 0.35562777519226074, "rewards/rejected": 2.6235859394073486, "step": 8472 }, { "epoch": 1.88, "learning_rate": 1.0147924942193576e-07, "logits/chosen": -1.6971527338027954, "logits/rejected": -1.2962608337402344, "logps/chosen": -37.98527526855469, "logps/rejected": -138.02719116210938, "loss": 0.3952, "rewards/accuracies": 1.0, "rewards/chosen": 5.164924621582031, "rewards/margins": 0.1579270362854004, "rewards/rejected": 5.006997585296631, "step": 8473 }, { "epoch": 1.88, "learning_rate": 1.0112029388738231e-07, "logits/chosen": -1.6439990997314453, "logits/rejected": -1.6268912553787231, "logps/chosen": -25.907527923583984, "logps/rejected": -55.55247497558594, "loss": 0.2044, "rewards/accuracies": 1.0, "rewards/chosen": 3.7294838428497314, "rewards/margins": 1.3088538646697998, "rewards/rejected": 2.4206299781799316, "step": 8474 }, { "epoch": 1.88, "learning_rate": 1.0076196784709446e-07, "logits/chosen": -1.9463467597961426, "logits/rejected": -1.9005236625671387, "logps/chosen": -39.76057052612305, "logps/rejected": -49.658966064453125, "loss": 0.4534, "rewards/accuracies": 0.0, "rewards/chosen": 3.2058537006378174, "rewards/margins": -0.32556581497192383, "rewards/rejected": 3.531419515609741, "step": 8475 }, { "epoch": 1.88, "learning_rate": 1.004042713471165e-07, "logits/chosen": -2.014409303665161, "logits/rejected": -1.993787169456482, "logps/chosen": -51.30048751831055, "logps/rejected": -49.25358581542969, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": 4.551417350769043, "rewards/margins": 1.5487477779388428, "rewards/rejected": 3.0026695728302, "step": 8476 }, { "epoch": 1.88, "learning_rate": 1.0004720443341054e-07, "logits/chosen": -1.9103587865829468, "logits/rejected": -1.9257313013076782, "logps/chosen": -18.898826599121094, "logps/rejected": -46.86686706542969, "loss": 0.2441, "rewards/accuracies": 1.0, "rewards/chosen": 3.99725341796875, "rewards/margins": 0.5620901584625244, "rewards/rejected": 3.4351632595062256, "step": 8477 }, { "epoch": 1.88, "learning_rate": 9.9690767151861e-08, "logits/chosen": -1.83097243309021, "logits/rejected": -1.7699388265609741, "logps/chosen": -93.04899597167969, "logps/rejected": -38.35102844238281, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 6.10378885269165, "rewards/margins": 2.489816427230835, "rewards/rejected": 3.6139724254608154, "step": 8478 }, { "epoch": 1.88, "learning_rate": 9.933495954826622e-08, "logits/chosen": -1.6914606094360352, "logits/rejected": -1.6800984144210815, "logps/chosen": -50.927677154541016, "logps/rejected": -50.95539093017578, "loss": 0.3377, "rewards/accuracies": 1.0, "rewards/chosen": 3.5563457012176514, "rewards/margins": 1.4916858673095703, "rewards/rejected": 2.064659833908081, "step": 8479 }, { "epoch": 1.88, "learning_rate": 9.897978166834966e-08, "logits/chosen": -2.126286029815674, "logits/rejected": -1.9603809118270874, "logps/chosen": -119.33977508544922, "logps/rejected": -41.180885314941406, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 9.21933364868164, "rewards/margins": 3.265367031097412, "rewards/rejected": 5.9539666175842285, "step": 8480 }, { "epoch": 1.88, "learning_rate": 9.862523355774867e-08, "logits/chosen": -1.913432240486145, "logits/rejected": -1.913432240486145, "logps/chosen": -37.72174072265625, "logps/rejected": -37.72174072265625, "loss": 0.3673, "rewards/accuracies": 0.0, "rewards/chosen": 4.310133457183838, "rewards/margins": 0.0, "rewards/rejected": 4.310133457183838, "step": 8481 }, { "epoch": 1.88, "learning_rate": 9.827131526202294e-08, "logits/chosen": -1.9439274072647095, "logits/rejected": -1.7067866325378418, "logps/chosen": -140.20535278320312, "logps/rejected": -52.450927734375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 10.009650230407715, "rewards/margins": 6.354179859161377, "rewards/rejected": 3.655470371246338, "step": 8482 }, { "epoch": 1.88, "learning_rate": 9.791802682664941e-08, "logits/chosen": -1.8561545610427856, "logits/rejected": -1.8289529085159302, "logps/chosen": -27.624338150024414, "logps/rejected": -31.194852828979492, "loss": 0.4789, "rewards/accuracies": 0.0, "rewards/chosen": 2.2475087642669678, "rewards/margins": -0.11063432693481445, "rewards/rejected": 2.3581430912017822, "step": 8483 }, { "epoch": 1.88, "learning_rate": 9.756536829702568e-08, "logits/chosen": -1.732530951499939, "logits/rejected": -1.5081700086593628, "logps/chosen": -101.91725158691406, "logps/rejected": -29.224681854248047, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 8.180429458618164, "rewards/margins": 3.339749813079834, "rewards/rejected": 4.84067964553833, "step": 8484 }, { "epoch": 1.88, "learning_rate": 9.721333971846658e-08, "logits/chosen": -1.984460711479187, "logits/rejected": -1.930272102355957, "logps/chosen": -51.5399169921875, "logps/rejected": -62.92639923095703, "loss": 0.24, "rewards/accuracies": 1.0, "rewards/chosen": 4.261445045471191, "rewards/margins": 1.7790391445159912, "rewards/rejected": 2.4824059009552, "step": 8485 }, { "epoch": 1.88, "learning_rate": 9.686194113620762e-08, "logits/chosen": -2.0525193214416504, "logits/rejected": -2.007352113723755, "logps/chosen": -46.55207443237305, "logps/rejected": -41.586936950683594, "loss": 0.4614, "rewards/accuracies": 0.0, "rewards/chosen": 4.310072898864746, "rewards/margins": -0.020807266235351562, "rewards/rejected": 4.330880165100098, "step": 8486 }, { "epoch": 1.88, "learning_rate": 9.651117259540266e-08, "logits/chosen": -2.04253888130188, "logits/rejected": -2.0828113555908203, "logps/chosen": -55.425498962402344, "logps/rejected": -89.88644409179688, "loss": 0.8388, "rewards/accuracies": 0.0, "rewards/chosen": 5.679830074310303, "rewards/margins": -1.4633979797363281, "rewards/rejected": 7.143228054046631, "step": 8487 }, { "epoch": 1.88, "learning_rate": 9.616103414112343e-08, "logits/chosen": -2.175393581390381, "logits/rejected": -2.076263904571533, "logps/chosen": -113.38288116455078, "logps/rejected": -65.71681213378906, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 8.71688461303711, "rewards/margins": 1.3223366737365723, "rewards/rejected": 7.394547939300537, "step": 8488 }, { "epoch": 1.88, "learning_rate": 9.581152581836339e-08, "logits/chosen": -2.0317962169647217, "logits/rejected": -1.9542430639266968, "logps/chosen": -101.51591491699219, "logps/rejected": -53.718143463134766, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 7.185128688812256, "rewards/margins": 3.715013265609741, "rewards/rejected": 3.4701154232025146, "step": 8489 }, { "epoch": 1.88, "learning_rate": 9.546264767203328e-08, "logits/chosen": -1.730701208114624, "logits/rejected": -1.7163102626800537, "logps/chosen": -21.98980712890625, "logps/rejected": -48.99098205566406, "loss": 0.4334, "rewards/accuracies": 0.0, "rewards/chosen": 2.768007755279541, "rewards/margins": -0.10503602027893066, "rewards/rejected": 2.8730437755584717, "step": 8490 }, { "epoch": 1.88, "learning_rate": 9.511439974696224e-08, "logits/chosen": -1.9548014402389526, "logits/rejected": -1.922899842262268, "logps/chosen": -84.35811614990234, "logps/rejected": -82.79257202148438, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 7.4390997886657715, "rewards/margins": 1.4701390266418457, "rewards/rejected": 5.968960762023926, "step": 8491 }, { "epoch": 1.88, "learning_rate": 9.476678208790002e-08, "logits/chosen": -2.102644443511963, "logits/rejected": -2.110006332397461, "logps/chosen": -182.6685333251953, "logps/rejected": -139.65072631835938, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 10.430009841918945, "rewards/margins": 3.1283998489379883, "rewards/rejected": 7.301609992980957, "step": 8492 }, { "epoch": 1.88, "learning_rate": 9.441979473951424e-08, "logits/chosen": -1.8282946348190308, "logits/rejected": -1.7672393321990967, "logps/chosen": -94.36173248291016, "logps/rejected": -65.39251708984375, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 6.095195770263672, "rewards/margins": 3.6731162071228027, "rewards/rejected": 2.422079563140869, "step": 8493 }, { "epoch": 1.88, "learning_rate": 9.4073437746392e-08, "logits/chosen": -2.0477454662323, "logits/rejected": -2.0023369789123535, "logps/chosen": -35.25664138793945, "logps/rejected": -9.33627700805664, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 3.2010509967803955, "rewards/margins": 1.6499477624893188, "rewards/rejected": 1.5511032342910767, "step": 8494 }, { "epoch": 1.88, "learning_rate": 9.372771115303992e-08, "logits/chosen": -1.7183244228363037, "logits/rejected": -1.6787583827972412, "logps/chosen": -83.2842025756836, "logps/rejected": -41.30418395996094, "loss": 0.182, "rewards/accuracies": 1.0, "rewards/chosen": 6.705147743225098, "rewards/margins": 0.8730630874633789, "rewards/rejected": 5.832084655761719, "step": 8495 }, { "epoch": 1.88, "learning_rate": 9.338261500388301e-08, "logits/chosen": -1.9806690216064453, "logits/rejected": -2.0003652572631836, "logps/chosen": -78.30393981933594, "logps/rejected": -73.9866943359375, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 8.209805488586426, "rewards/margins": 1.97590970993042, "rewards/rejected": 6.233895778656006, "step": 8496 }, { "epoch": 1.88, "learning_rate": 9.303814934326361e-08, "logits/chosen": -2.1148617267608643, "logits/rejected": -2.1304547786712646, "logps/chosen": -30.630821228027344, "logps/rejected": -131.61935424804688, "loss": 1.5978, "rewards/accuracies": 0.0, "rewards/chosen": 5.679607391357422, "rewards/margins": -3.148761749267578, "rewards/rejected": 8.828369140625, "step": 8497 }, { "epoch": 1.88, "learning_rate": 9.269431421544739e-08, "logits/chosen": -1.7997362613677979, "logits/rejected": -1.8049978017807007, "logps/chosen": -47.597328186035156, "logps/rejected": -57.852783203125, "loss": 0.4272, "rewards/accuracies": 1.0, "rewards/chosen": 3.493748426437378, "rewards/margins": 0.018639326095581055, "rewards/rejected": 3.475109100341797, "step": 8498 }, { "epoch": 1.88, "learning_rate": 9.235110966461403e-08, "logits/chosen": -2.1437814235687256, "logits/rejected": -2.1228203773498535, "logps/chosen": -127.09124755859375, "logps/rejected": -87.088134765625, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 9.397381782531738, "rewards/margins": 4.181283473968506, "rewards/rejected": 5.216098308563232, "step": 8499 }, { "epoch": 1.88, "learning_rate": 9.200853573486545e-08, "logits/chosen": -1.7885944843292236, "logits/rejected": -1.757996916770935, "logps/chosen": -55.93985366821289, "logps/rejected": -80.57015228271484, "loss": 0.0884, "rewards/accuracies": 1.0, "rewards/chosen": 3.903690814971924, "rewards/margins": 1.8356640338897705, "rewards/rejected": 2.0680267810821533, "step": 8500 }, { "epoch": 1.88, "learning_rate": 9.166659247022203e-08, "logits/chosen": -2.040428876876831, "logits/rejected": -2.048421621322632, "logps/chosen": -57.725582122802734, "logps/rejected": -61.202999114990234, "loss": 0.2969, "rewards/accuracies": 1.0, "rewards/chosen": 2.4664113521575928, "rewards/margins": 0.21867609024047852, "rewards/rejected": 2.2477352619171143, "step": 8501 }, { "epoch": 1.88, "learning_rate": 9.132527991462193e-08, "logits/chosen": -2.2448554039001465, "logits/rejected": -2.243825912475586, "logps/chosen": -131.03402709960938, "logps/rejected": -146.80657958984375, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 14.054919242858887, "rewards/margins": 4.383176803588867, "rewards/rejected": 9.67174243927002, "step": 8502 }, { "epoch": 1.88, "learning_rate": 9.098459811192285e-08, "logits/chosen": -2.262120246887207, "logits/rejected": -2.2387356758117676, "logps/chosen": -25.2065486907959, "logps/rejected": -65.53081512451172, "loss": 0.4606, "rewards/accuracies": 1.0, "rewards/chosen": 2.912724494934082, "rewards/margins": 1.3621149063110352, "rewards/rejected": 1.5506095886230469, "step": 8503 }, { "epoch": 1.88, "learning_rate": 9.064454710590253e-08, "logits/chosen": -1.825911045074463, "logits/rejected": -1.672756552696228, "logps/chosen": -116.04756164550781, "logps/rejected": -36.7453498840332, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 7.45412015914917, "rewards/margins": 4.855329513549805, "rewards/rejected": 2.5987908840179443, "step": 8504 }, { "epoch": 1.88, "learning_rate": 9.030512694025606e-08, "logits/chosen": -1.7997187376022339, "logits/rejected": -1.7997187376022339, "logps/chosen": -9.816861152648926, "logps/rejected": -9.816861152648926, "loss": 0.3468, "rewards/accuracies": 0.0, "rewards/chosen": 1.1978700160980225, "rewards/margins": 0.0, "rewards/rejected": 1.1978700160980225, "step": 8505 }, { "epoch": 1.88, "learning_rate": 8.996633765859797e-08, "logits/chosen": -2.1809916496276855, "logits/rejected": -2.194495677947998, "logps/chosen": -113.23619842529297, "logps/rejected": -55.91453552246094, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 8.912018775939941, "rewards/margins": 1.669022560119629, "rewards/rejected": 7.2429962158203125, "step": 8506 }, { "epoch": 1.88, "learning_rate": 8.962817930446233e-08, "logits/chosen": -2.326160192489624, "logits/rejected": -2.3321447372436523, "logps/chosen": -136.32994079589844, "logps/rejected": -125.67399597167969, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": 10.811433792114258, "rewards/margins": 3.2222352027893066, "rewards/rejected": 7.589198589324951, "step": 8507 }, { "epoch": 1.88, "learning_rate": 8.92906519213016e-08, "logits/chosen": -1.758236050605774, "logits/rejected": -1.7866804599761963, "logps/chosen": -37.51286315917969, "logps/rejected": -70.25701904296875, "loss": 0.8651, "rewards/accuracies": 1.0, "rewards/chosen": 5.15981912612915, "rewards/margins": 2.5240790843963623, "rewards/rejected": 2.635740041732788, "step": 8508 }, { "epoch": 1.88, "learning_rate": 8.895375555248664e-08, "logits/chosen": -1.8245049715042114, "logits/rejected": -1.7410588264465332, "logps/chosen": -166.18218994140625, "logps/rejected": -55.71830749511719, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 6.44081449508667, "rewards/margins": 3.2192392349243164, "rewards/rejected": 3.2215752601623535, "step": 8509 }, { "epoch": 1.88, "learning_rate": 8.86174902413084e-08, "logits/chosen": -1.9024779796600342, "logits/rejected": -1.681978464126587, "logps/chosen": -63.540225982666016, "logps/rejected": -108.36528015136719, "loss": 0.207, "rewards/accuracies": 1.0, "rewards/chosen": 5.897680282592773, "rewards/margins": 0.815833568572998, "rewards/rejected": 5.081846714019775, "step": 8510 }, { "epoch": 1.88, "learning_rate": 8.828185603097617e-08, "logits/chosen": -1.8872812986373901, "logits/rejected": -1.8695955276489258, "logps/chosen": -83.37370300292969, "logps/rejected": -153.50543212890625, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 8.311755180358887, "rewards/margins": 3.60807466506958, "rewards/rejected": 4.703680515289307, "step": 8511 }, { "epoch": 1.88, "learning_rate": 8.79468529646177e-08, "logits/chosen": -1.6925724744796753, "logits/rejected": -1.6050183773040771, "logps/chosen": -41.589508056640625, "logps/rejected": -34.32695770263672, "loss": 0.6958, "rewards/accuracies": 1.0, "rewards/chosen": 3.82073974609375, "rewards/margins": 0.32504796981811523, "rewards/rejected": 3.4956917762756348, "step": 8512 }, { "epoch": 1.88, "learning_rate": 8.761248108528075e-08, "logits/chosen": -1.891505479812622, "logits/rejected": -1.7335445880889893, "logps/chosen": -100.98910522460938, "logps/rejected": -44.678016662597656, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 6.971753120422363, "rewards/margins": 4.400075912475586, "rewards/rejected": 2.5716774463653564, "step": 8513 }, { "epoch": 1.88, "learning_rate": 8.727874043593044e-08, "logits/chosen": -1.899505615234375, "logits/rejected": -1.924089789390564, "logps/chosen": -45.61548614501953, "logps/rejected": -75.2935791015625, "loss": 1.5186, "rewards/accuracies": 0.0, "rewards/chosen": 4.357425689697266, "rewards/margins": -2.830812931060791, "rewards/rejected": 7.188238620758057, "step": 8514 }, { "epoch": 1.88, "learning_rate": 8.694563105945242e-08, "logits/chosen": -1.7811322212219238, "logits/rejected": -1.736131191253662, "logps/chosen": -43.05029296875, "logps/rejected": -39.69468307495117, "loss": 0.2889, "rewards/accuracies": 1.0, "rewards/chosen": 3.0991179943084717, "rewards/margins": 0.2511172294616699, "rewards/rejected": 2.8480007648468018, "step": 8515 }, { "epoch": 1.88, "learning_rate": 8.661315299865025e-08, "logits/chosen": -2.141702175140381, "logits/rejected": -2.088848352432251, "logps/chosen": -103.03443908691406, "logps/rejected": -83.9881591796875, "loss": 0.1462, "rewards/accuracies": 1.0, "rewards/chosen": 4.343696594238281, "rewards/margins": 1.3304214477539062, "rewards/rejected": 3.013275146484375, "step": 8516 }, { "epoch": 1.89, "learning_rate": 8.628130629624643e-08, "logits/chosen": -1.9572861194610596, "logits/rejected": -1.911722183227539, "logps/chosen": -31.960914611816406, "logps/rejected": -24.334583282470703, "loss": 0.3567, "rewards/accuracies": 1.0, "rewards/chosen": 2.1929993629455566, "rewards/margins": 1.2596855163574219, "rewards/rejected": 0.93331378698349, "step": 8517 }, { "epoch": 1.89, "learning_rate": 8.595009099488238e-08, "logits/chosen": -1.621368169784546, "logits/rejected": -1.6332991123199463, "logps/chosen": -60.02202606201172, "logps/rejected": -46.66496276855469, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 2.396632432937622, "rewards/margins": 0.28125452995300293, "rewards/rejected": 2.115377902984619, "step": 8518 }, { "epoch": 1.89, "learning_rate": 8.561950713711909e-08, "logits/chosen": -1.9132487773895264, "logits/rejected": -1.6365597248077393, "logps/chosen": -138.06463623046875, "logps/rejected": -28.447444915771484, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 7.379248142242432, "rewards/margins": 6.924693584442139, "rewards/rejected": 0.45455437898635864, "step": 8519 }, { "epoch": 1.89, "learning_rate": 8.528955476543477e-08, "logits/chosen": -1.8402296304702759, "logits/rejected": -1.7853106260299683, "logps/chosen": -42.4991455078125, "logps/rejected": -20.645353317260742, "loss": 0.1429, "rewards/accuracies": 1.0, "rewards/chosen": 3.09470534324646, "rewards/margins": 1.4605106115341187, "rewards/rejected": 1.6341947317123413, "step": 8520 }, { "epoch": 1.89, "learning_rate": 8.496023392222774e-08, "logits/chosen": -1.872258186340332, "logits/rejected": -1.7690972089767456, "logps/chosen": -89.05077362060547, "logps/rejected": -74.48748016357422, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 6.7924370765686035, "rewards/margins": 3.4703445434570312, "rewards/rejected": 3.3220925331115723, "step": 8521 }, { "epoch": 1.89, "learning_rate": 8.463154464981582e-08, "logits/chosen": -2.1488585472106934, "logits/rejected": -2.1274430751800537, "logps/chosen": -27.158615112304688, "logps/rejected": -4.84511661529541, "loss": 0.1784, "rewards/accuracies": 1.0, "rewards/chosen": 2.0819900035858154, "rewards/margins": 1.017226219177246, "rewards/rejected": 1.0647637844085693, "step": 8522 }, { "epoch": 1.89, "learning_rate": 8.430348699043356e-08, "logits/chosen": -1.9941977262496948, "logits/rejected": -2.012456178665161, "logps/chosen": -97.93270874023438, "logps/rejected": -159.6853790283203, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 13.275851249694824, "rewards/margins": 4.079339027404785, "rewards/rejected": 9.196512222290039, "step": 8523 }, { "epoch": 1.89, "learning_rate": 8.397606098623667e-08, "logits/chosen": -2.277625322341919, "logits/rejected": -2.2531979084014893, "logps/chosen": -70.29257202148438, "logps/rejected": -67.90863037109375, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 4.749156951904297, "rewards/margins": 3.4242827892303467, "rewards/rejected": 1.3248741626739502, "step": 8524 }, { "epoch": 1.89, "learning_rate": 8.364926667929818e-08, "logits/chosen": -2.0811305046081543, "logits/rejected": -1.9334675073623657, "logps/chosen": -176.22171020507812, "logps/rejected": -134.41123962402344, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 9.342129707336426, "rewards/margins": 4.099339485168457, "rewards/rejected": 5.242790222167969, "step": 8525 }, { "epoch": 1.89, "learning_rate": 8.33231041116106e-08, "logits/chosen": -1.9626795053482056, "logits/rejected": -1.958343267440796, "logps/chosen": -24.593921661376953, "logps/rejected": -35.89120864868164, "loss": 0.6208, "rewards/accuracies": 0.0, "rewards/chosen": 2.52135968208313, "rewards/margins": -0.9002876281738281, "rewards/rejected": 3.421647310256958, "step": 8526 }, { "epoch": 1.89, "learning_rate": 8.29975733250843e-08, "logits/chosen": -1.7842868566513062, "logits/rejected": -1.786898136138916, "logps/chosen": -19.774227142333984, "logps/rejected": -54.58879089355469, "loss": 2.4703, "rewards/accuracies": 0.0, "rewards/chosen": 1.9992382526397705, "rewards/margins": -1.8113970756530762, "rewards/rejected": 3.8106353282928467, "step": 8527 }, { "epoch": 1.89, "learning_rate": 8.267267436154968e-08, "logits/chosen": -2.19596004486084, "logits/rejected": -2.1636950969696045, "logps/chosen": -36.74385070800781, "logps/rejected": -46.596397399902344, "loss": 0.4668, "rewards/accuracies": 1.0, "rewards/chosen": 4.9383769035339355, "rewards/margins": 1.0751559734344482, "rewards/rejected": 3.8632209300994873, "step": 8528 }, { "epoch": 1.89, "learning_rate": 8.234840726275561e-08, "logits/chosen": -2.1860787868499756, "logits/rejected": -2.1454360485076904, "logps/chosen": -39.00102615356445, "logps/rejected": -17.43683433532715, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 2.99059796333313, "rewards/margins": 2.436337947845459, "rewards/rejected": 0.5542598962783813, "step": 8529 }, { "epoch": 1.89, "learning_rate": 8.202477207036986e-08, "logits/chosen": -2.0834672451019287, "logits/rejected": -2.117314338684082, "logps/chosen": -85.41856384277344, "logps/rejected": -94.03433990478516, "loss": 0.7695, "rewards/accuracies": 0.0, "rewards/chosen": 12.108522415161133, "rewards/margins": -1.2514324188232422, "rewards/rejected": 13.359954833984375, "step": 8530 }, { "epoch": 1.89, "learning_rate": 8.170176882597803e-08, "logits/chosen": -2.058228015899658, "logits/rejected": -1.9673399925231934, "logps/chosen": -94.61402893066406, "logps/rejected": -80.260498046875, "loss": 0.2221, "rewards/accuracies": 1.0, "rewards/chosen": 9.568611145019531, "rewards/margins": 0.739842414855957, "rewards/rejected": 8.828768730163574, "step": 8531 }, { "epoch": 1.89, "learning_rate": 8.137939757108526e-08, "logits/chosen": -1.9251070022583008, "logits/rejected": -1.8772776126861572, "logps/chosen": -55.56396484375, "logps/rejected": -54.05612564086914, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 6.781208038330078, "rewards/margins": 3.0413553714752197, "rewards/rejected": 3.7398526668548584, "step": 8532 }, { "epoch": 1.89, "learning_rate": 8.105765834711676e-08, "logits/chosen": -2.0887959003448486, "logits/rejected": -2.0756702423095703, "logps/chosen": -78.01087188720703, "logps/rejected": -57.74254608154297, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": 9.447942733764648, "rewards/margins": 2.070979595184326, "rewards/rejected": 7.376963138580322, "step": 8533 }, { "epoch": 1.89, "learning_rate": 8.073655119541335e-08, "logits/chosen": -1.8315887451171875, "logits/rejected": -1.8315887451171875, "logps/chosen": -3.2347049713134766, "logps/rejected": -3.2347049713134766, "loss": 0.512, "rewards/accuracies": 0.0, "rewards/chosen": 2.053313970565796, "rewards/margins": 0.0, "rewards/rejected": 2.053313970565796, "step": 8534 }, { "epoch": 1.89, "learning_rate": 8.041607615723868e-08, "logits/chosen": -1.8538178205490112, "logits/rejected": -1.8701261281967163, "logps/chosen": -17.046546936035156, "logps/rejected": -145.75392150878906, "loss": 1.3392, "rewards/accuracies": 0.0, "rewards/chosen": 5.226511478424072, "rewards/margins": -2.2584619522094727, "rewards/rejected": 7.484973430633545, "step": 8535 }, { "epoch": 1.89, "learning_rate": 8.009623327377037e-08, "logits/chosen": -2.0542192459106445, "logits/rejected": -1.9278814792633057, "logps/chosen": -75.09605407714844, "logps/rejected": -159.51141357421875, "loss": 0.1994, "rewards/accuracies": 1.0, "rewards/chosen": 9.067804336547852, "rewards/margins": 1.726423740386963, "rewards/rejected": 7.341380596160889, "step": 8536 }, { "epoch": 1.89, "learning_rate": 7.977702258611054e-08, "logits/chosen": -1.8539727926254272, "logits/rejected": -1.8543826341629028, "logps/chosen": -51.281864166259766, "logps/rejected": -30.316003799438477, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": 4.419961452484131, "rewards/margins": 1.1702260971069336, "rewards/rejected": 3.2497353553771973, "step": 8537 }, { "epoch": 1.89, "learning_rate": 7.94584441352747e-08, "logits/chosen": -2.056847095489502, "logits/rejected": -2.056847095489502, "logps/chosen": -40.135009765625, "logps/rejected": -40.135009765625, "loss": 0.3987, "rewards/accuracies": 0.0, "rewards/chosen": 2.156471014022827, "rewards/margins": 0.0, "rewards/rejected": 2.156471014022827, "step": 8538 }, { "epoch": 1.89, "learning_rate": 7.914049796220013e-08, "logits/chosen": -1.6753829717636108, "logits/rejected": -1.6753829717636108, "logps/chosen": -41.001495361328125, "logps/rejected": -41.001495361328125, "loss": 0.3748, "rewards/accuracies": 0.0, "rewards/chosen": 2.620898485183716, "rewards/margins": 0.0, "rewards/rejected": 2.620898485183716, "step": 8539 }, { "epoch": 1.89, "learning_rate": 7.882318410774193e-08, "logits/chosen": -1.7601392269134521, "logits/rejected": -1.7586332559585571, "logps/chosen": -34.90303039550781, "logps/rejected": -55.93882751464844, "loss": 1.1079, "rewards/accuracies": 0.0, "rewards/chosen": 5.302302837371826, "rewards/margins": -1.9246931076049805, "rewards/rejected": 7.226995944976807, "step": 8540 }, { "epoch": 1.89, "learning_rate": 7.850650261267468e-08, "logits/chosen": -1.9599372148513794, "logits/rejected": -1.8274179697036743, "logps/chosen": -93.41934204101562, "logps/rejected": -47.672027587890625, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 6.684326171875, "rewards/margins": 4.4853363037109375, "rewards/rejected": 2.1989898681640625, "step": 8541 }, { "epoch": 1.89, "learning_rate": 7.819045351769083e-08, "logits/chosen": -2.0575942993164062, "logits/rejected": -1.916797399520874, "logps/chosen": -80.88018798828125, "logps/rejected": -36.92977523803711, "loss": 0.1284, "rewards/accuracies": 1.0, "rewards/chosen": 5.013001918792725, "rewards/margins": 1.2677552700042725, "rewards/rejected": 3.745246648788452, "step": 8542 }, { "epoch": 1.89, "learning_rate": 7.78750368634018e-08, "logits/chosen": -1.7795143127441406, "logits/rejected": -1.7467049360275269, "logps/chosen": -13.046709060668945, "logps/rejected": -17.092849731445312, "loss": 0.554, "rewards/accuracies": 0.0, "rewards/chosen": 1.884062647819519, "rewards/margins": -0.6152912378311157, "rewards/rejected": 2.4993538856506348, "step": 8543 }, { "epoch": 1.89, "learning_rate": 7.756025269033851e-08, "logits/chosen": -2.0877013206481934, "logits/rejected": -2.0800180435180664, "logps/chosen": -28.5872859954834, "logps/rejected": -13.018577575683594, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 3.7260901927948, "rewards/margins": 3.2215328216552734, "rewards/rejected": 0.5045574307441711, "step": 8544 }, { "epoch": 1.89, "learning_rate": 7.724610103894914e-08, "logits/chosen": -1.6759593486785889, "logits/rejected": -1.6473045349121094, "logps/chosen": -31.67185401916504, "logps/rejected": -66.33755493164062, "loss": 0.6365, "rewards/accuracies": 0.0, "rewards/chosen": 2.9989469051361084, "rewards/margins": -0.5193386077880859, "rewards/rejected": 3.5182855129241943, "step": 8545 }, { "epoch": 1.89, "learning_rate": 7.693258194960252e-08, "logits/chosen": -1.968916893005371, "logits/rejected": -1.9457558393478394, "logps/chosen": -64.48858642578125, "logps/rejected": -65.92288208007812, "loss": 0.0795, "rewards/accuracies": 1.0, "rewards/chosen": 4.63356876373291, "rewards/margins": 2.0168588161468506, "rewards/rejected": 2.6167099475860596, "step": 8546 }, { "epoch": 1.89, "learning_rate": 7.661969546258363e-08, "logits/chosen": -2.00277042388916, "logits/rejected": -1.884103536605835, "logps/chosen": -108.82482147216797, "logps/rejected": -67.17613983154297, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 8.861639976501465, "rewards/margins": 6.512006759643555, "rewards/rejected": 2.349632978439331, "step": 8547 }, { "epoch": 1.89, "learning_rate": 7.630744161809866e-08, "logits/chosen": -1.5886998176574707, "logits/rejected": -1.6446781158447266, "logps/chosen": -7.269947528839111, "logps/rejected": -33.63227462768555, "loss": 1.1995, "rewards/accuracies": 0.0, "rewards/chosen": 1.5602115392684937, "rewards/margins": -1.8381386995315552, "rewards/rejected": 3.398350238800049, "step": 8548 }, { "epoch": 1.89, "learning_rate": 7.599582045627108e-08, "logits/chosen": -1.9926085472106934, "logits/rejected": -1.9733242988586426, "logps/chosen": -85.49064636230469, "logps/rejected": -88.99024963378906, "loss": 0.0833, "rewards/accuracies": 1.0, "rewards/chosen": 9.985468864440918, "rewards/margins": 2.385951042175293, "rewards/rejected": 7.599517822265625, "step": 8549 }, { "epoch": 1.89, "learning_rate": 7.568483201714328e-08, "logits/chosen": -1.927891492843628, "logits/rejected": -1.946424961090088, "logps/chosen": -47.60541534423828, "logps/rejected": -55.430030822753906, "loss": 0.4738, "rewards/accuracies": 0.0, "rewards/chosen": 3.943990468978882, "rewards/margins": -0.4515821933746338, "rewards/rejected": 4.395572662353516, "step": 8550 }, { "epoch": 1.89, "learning_rate": 7.537447634067774e-08, "logits/chosen": -2.2136189937591553, "logits/rejected": -2.2189579010009766, "logps/chosen": -100.84666442871094, "logps/rejected": -156.21588134765625, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": 5.607757568359375, "rewards/margins": 3.1663849353790283, "rewards/rejected": 2.4413726329803467, "step": 8551 }, { "epoch": 1.89, "learning_rate": 7.506475346675258e-08, "logits/chosen": -2.356154203414917, "logits/rejected": -2.3631174564361572, "logps/chosen": -29.83448600769043, "logps/rejected": -101.3095703125, "loss": 1.1487, "rewards/accuracies": 0.0, "rewards/chosen": 5.458963871002197, "rewards/margins": -1.4712915420532227, "rewards/rejected": 6.93025541305542, "step": 8552 }, { "epoch": 1.89, "learning_rate": 7.475566343516816e-08, "logits/chosen": -2.202603340148926, "logits/rejected": -2.202603340148926, "logps/chosen": -45.166988372802734, "logps/rejected": -45.166988372802734, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 4.931978225708008, "rewards/margins": 0.0, "rewards/rejected": 4.931978225708008, "step": 8553 }, { "epoch": 1.89, "learning_rate": 7.444720628564051e-08, "logits/chosen": -1.7287119626998901, "logits/rejected": -1.7022680044174194, "logps/chosen": -25.546772003173828, "logps/rejected": -36.706478118896484, "loss": 0.9402, "rewards/accuracies": 0.0, "rewards/chosen": 4.584544658660889, "rewards/margins": -1.1809206008911133, "rewards/rejected": 5.765465259552002, "step": 8554 }, { "epoch": 1.89, "learning_rate": 7.413938205780679e-08, "logits/chosen": -1.8040447235107422, "logits/rejected": -1.782829999923706, "logps/chosen": -84.45425415039062, "logps/rejected": -50.86347961425781, "loss": 0.1586, "rewards/accuracies": 1.0, "rewards/chosen": 4.404043674468994, "rewards/margins": 1.9229493141174316, "rewards/rejected": 2.4810943603515625, "step": 8555 }, { "epoch": 1.89, "learning_rate": 7.38321907912204e-08, "logits/chosen": -2.0326623916625977, "logits/rejected": -1.9641003608703613, "logps/chosen": -71.5907974243164, "logps/rejected": -44.203575134277344, "loss": 0.2736, "rewards/accuracies": 1.0, "rewards/chosen": 6.951320648193359, "rewards/margins": 3.0291168689727783, "rewards/rejected": 3.922203779220581, "step": 8556 }, { "epoch": 1.89, "learning_rate": 7.352563252535528e-08, "logits/chosen": -1.8664143085479736, "logits/rejected": -1.782159447669983, "logps/chosen": -145.30288696289062, "logps/rejected": -62.651065826416016, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": 7.380337715148926, "rewards/margins": 2.106607437133789, "rewards/rejected": 5.273730278015137, "step": 8557 }, { "epoch": 1.89, "learning_rate": 7.321970729960381e-08, "logits/chosen": -1.9276421070098877, "logits/rejected": -1.9266221523284912, "logps/chosen": -173.9707489013672, "logps/rejected": -179.41539001464844, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": 10.646260261535645, "rewards/margins": 3.482186794281006, "rewards/rejected": 7.164073467254639, "step": 8558 }, { "epoch": 1.89, "learning_rate": 7.291441515327625e-08, "logits/chosen": -1.8720299005508423, "logits/rejected": -1.8823083639144897, "logps/chosen": -22.745872497558594, "logps/rejected": -38.11969757080078, "loss": 0.2071, "rewards/accuracies": 1.0, "rewards/chosen": 2.9697468280792236, "rewards/margins": 0.9126458168029785, "rewards/rejected": 2.057101011276245, "step": 8559 }, { "epoch": 1.89, "learning_rate": 7.260975612560173e-08, "logits/chosen": -1.7344201803207397, "logits/rejected": -1.7071961164474487, "logps/chosen": -50.35976028442383, "logps/rejected": -67.75621795654297, "loss": 0.4357, "rewards/accuracies": 0.0, "rewards/chosen": 5.29640007019043, "rewards/margins": -0.3068404197692871, "rewards/rejected": 5.603240489959717, "step": 8560 }, { "epoch": 1.89, "learning_rate": 7.23057302557284e-08, "logits/chosen": -1.8035264015197754, "logits/rejected": -1.7439424991607666, "logps/chosen": -89.03620910644531, "logps/rejected": -61.04317092895508, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 6.001289367675781, "rewards/margins": 2.906343460083008, "rewards/rejected": 3.0949459075927734, "step": 8561 }, { "epoch": 1.9, "learning_rate": 7.200233758272335e-08, "logits/chosen": -2.098024845123291, "logits/rejected": -2.0651955604553223, "logps/chosen": -47.1381721496582, "logps/rejected": -63.57941436767578, "loss": 0.2591, "rewards/accuracies": 1.0, "rewards/chosen": 4.054736614227295, "rewards/margins": 1.2254235744476318, "rewards/rejected": 2.829313039779663, "step": 8562 }, { "epoch": 1.9, "learning_rate": 7.169957814557094e-08, "logits/chosen": -1.9801044464111328, "logits/rejected": -1.9940667152404785, "logps/chosen": -60.42517852783203, "logps/rejected": -63.947811126708984, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": 4.923238277435303, "rewards/margins": 2.0551462173461914, "rewards/rejected": 2.8680920600891113, "step": 8563 }, { "epoch": 1.9, "learning_rate": 7.139745198317561e-08, "logits/chosen": -1.9106203317642212, "logits/rejected": -1.9106203317642212, "logps/chosen": -44.385986328125, "logps/rejected": -44.385986328125, "loss": 0.3563, "rewards/accuracies": 0.0, "rewards/chosen": 3.1413865089416504, "rewards/margins": 0.0, "rewards/rejected": 3.1413865089416504, "step": 8564 }, { "epoch": 1.9, "learning_rate": 7.109595913435962e-08, "logits/chosen": -1.7882280349731445, "logits/rejected": -1.8448920249938965, "logps/chosen": -36.97724914550781, "logps/rejected": -74.14974975585938, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": 5.318533420562744, "rewards/margins": 2.1680076122283936, "rewards/rejected": 3.1505258083343506, "step": 8565 }, { "epoch": 1.9, "learning_rate": 7.079509963786424e-08, "logits/chosen": -2.223630905151367, "logits/rejected": -2.180006742477417, "logps/chosen": -36.854270935058594, "logps/rejected": -32.653629302978516, "loss": 0.1642, "rewards/accuracies": 1.0, "rewards/chosen": 4.105116367340088, "rewards/margins": 0.9517505168914795, "rewards/rejected": 3.1533658504486084, "step": 8566 }, { "epoch": 1.9, "learning_rate": 7.049487353234852e-08, "logits/chosen": -2.004054546356201, "logits/rejected": -2.005171060562134, "logps/chosen": -49.60346221923828, "logps/rejected": -35.620384216308594, "loss": 0.315, "rewards/accuracies": 1.0, "rewards/chosen": 5.121256351470947, "rewards/margins": 1.3884148597717285, "rewards/rejected": 3.7328414916992188, "step": 8567 }, { "epoch": 1.9, "learning_rate": 7.01952808563916e-08, "logits/chosen": -2.2094037532806396, "logits/rejected": -2.2265892028808594, "logps/chosen": -73.662841796875, "logps/rejected": -84.32023620605469, "loss": 0.1906, "rewards/accuracies": 1.0, "rewards/chosen": 9.9620361328125, "rewards/margins": 0.7682504653930664, "rewards/rejected": 9.193785667419434, "step": 8568 }, { "epoch": 1.9, "learning_rate": 6.989632164849102e-08, "logits/chosen": -2.080906391143799, "logits/rejected": -2.1279077529907227, "logps/chosen": -60.758758544921875, "logps/rejected": -106.4844741821289, "loss": 0.499, "rewards/accuracies": 0.0, "rewards/chosen": 7.666342258453369, "rewards/margins": -0.37163591384887695, "rewards/rejected": 8.037978172302246, "step": 8569 }, { "epoch": 1.9, "learning_rate": 6.959799594706052e-08, "logits/chosen": -2.2208502292633057, "logits/rejected": -2.2504820823669434, "logps/chosen": -55.150390625, "logps/rejected": -88.42922973632812, "loss": 1.5833, "rewards/accuracies": 0.0, "rewards/chosen": 6.838532447814941, "rewards/margins": -2.982545852661133, "rewards/rejected": 9.821078300476074, "step": 8570 }, { "epoch": 1.9, "learning_rate": 6.930030379043551e-08, "logits/chosen": -2.1779799461364746, "logits/rejected": -2.1814768314361572, "logps/chosen": -67.1600570678711, "logps/rejected": -83.3475112915039, "loss": 0.3844, "rewards/accuracies": 1.0, "rewards/chosen": 6.445546627044678, "rewards/margins": 2.201641082763672, "rewards/rejected": 4.243905544281006, "step": 8571 }, { "epoch": 1.9, "learning_rate": 6.900324521686763e-08, "logits/chosen": -2.091322422027588, "logits/rejected": -2.1097919940948486, "logps/chosen": -42.669891357421875, "logps/rejected": -54.49498748779297, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 6.170731544494629, "rewards/margins": 0.8769898414611816, "rewards/rejected": 5.293741703033447, "step": 8572 }, { "epoch": 1.9, "learning_rate": 6.870682026453023e-08, "logits/chosen": -1.9935232400894165, "logits/rejected": -2.0159549713134766, "logps/chosen": -68.1983871459961, "logps/rejected": -52.901981353759766, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 6.845584869384766, "rewards/margins": 3.161576509475708, "rewards/rejected": 3.6840083599090576, "step": 8573 }, { "epoch": 1.9, "learning_rate": 6.84110289715112e-08, "logits/chosen": -1.819304347038269, "logits/rejected": -1.8706698417663574, "logps/chosen": -57.84675216674805, "logps/rejected": -107.97410583496094, "loss": 1.8922, "rewards/accuracies": 0.0, "rewards/chosen": 7.394218444824219, "rewards/margins": -3.6818790435791016, "rewards/rejected": 11.07609748840332, "step": 8574 }, { "epoch": 1.9, "learning_rate": 6.81158713758201e-08, "logits/chosen": -2.0369813442230225, "logits/rejected": -2.027089834213257, "logps/chosen": -73.734375, "logps/rejected": -72.48282623291016, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 6.763771057128906, "rewards/margins": 4.449640274047852, "rewards/rejected": 2.3141305446624756, "step": 8575 }, { "epoch": 1.9, "learning_rate": 6.782134751538327e-08, "logits/chosen": -1.8822624683380127, "logits/rejected": -1.8858813047409058, "logps/chosen": -56.88513946533203, "logps/rejected": -49.775970458984375, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 3.525412082672119, "rewards/margins": 1.0730736255645752, "rewards/rejected": 2.452338457107544, "step": 8576 }, { "epoch": 1.9, "learning_rate": 6.752745742804656e-08, "logits/chosen": -1.7552850246429443, "logits/rejected": -1.7552850246429443, "logps/chosen": -6.355437755584717, "logps/rejected": -6.355437755584717, "loss": 0.3481, "rewards/accuracies": 0.0, "rewards/chosen": 1.1631792783737183, "rewards/margins": 0.0, "rewards/rejected": 1.1631792783737183, "step": 8577 }, { "epoch": 1.9, "learning_rate": 6.723420115157475e-08, "logits/chosen": -2.123250722885132, "logits/rejected": -2.1336965560913086, "logps/chosen": -52.7009391784668, "logps/rejected": -40.764625549316406, "loss": 0.6097, "rewards/accuracies": 0.0, "rewards/chosen": 4.9239277839660645, "rewards/margins": -0.8427233695983887, "rewards/rejected": 5.766651153564453, "step": 8578 }, { "epoch": 1.9, "learning_rate": 6.694157872364937e-08, "logits/chosen": -2.0610246658325195, "logits/rejected": -2.064680814743042, "logps/chosen": -34.74248123168945, "logps/rejected": -34.34823226928711, "loss": 0.1237, "rewards/accuracies": 1.0, "rewards/chosen": 4.224053859710693, "rewards/margins": 1.2777299880981445, "rewards/rejected": 2.946323871612549, "step": 8579 }, { "epoch": 1.9, "learning_rate": 6.664959018187311e-08, "logits/chosen": -1.9529167413711548, "logits/rejected": -1.9529167413711548, "logps/chosen": -34.950233459472656, "logps/rejected": -34.950233459472656, "loss": 1.0522, "rewards/accuracies": 0.0, "rewards/chosen": 3.0198419094085693, "rewards/margins": 0.0, "rewards/rejected": 3.0198419094085693, "step": 8580 }, { "epoch": 1.9, "learning_rate": 6.635823556376431e-08, "logits/chosen": -2.0004055500030518, "logits/rejected": -1.9623522758483887, "logps/chosen": -92.00593566894531, "logps/rejected": -50.21748733520508, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 6.674867153167725, "rewards/margins": 2.757056713104248, "rewards/rejected": 3.9178104400634766, "step": 8581 }, { "epoch": 1.9, "learning_rate": 6.6067514906763e-08, "logits/chosen": -1.9018571376800537, "logits/rejected": -1.961295485496521, "logps/chosen": -113.72273254394531, "logps/rejected": -181.08033752441406, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 11.691166877746582, "rewards/margins": 3.7812561988830566, "rewards/rejected": 7.909910678863525, "step": 8582 }, { "epoch": 1.9, "learning_rate": 6.57774282482243e-08, "logits/chosen": -2.0164694786071777, "logits/rejected": -2.0159807205200195, "logps/chosen": -40.80687713623047, "logps/rejected": -22.536733627319336, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": 4.657495975494385, "rewards/margins": 1.1543891429901123, "rewards/rejected": 3.5031068325042725, "step": 8583 }, { "epoch": 1.9, "learning_rate": 6.548797562542508e-08, "logits/chosen": -1.8984891176223755, "logits/rejected": -1.872360110282898, "logps/chosen": -40.46381378173828, "logps/rejected": -17.581716537475586, "loss": 0.1103, "rewards/accuracies": 1.0, "rewards/chosen": 3.1578338146209717, "rewards/margins": 1.768424391746521, "rewards/rejected": 1.3894094228744507, "step": 8584 }, { "epoch": 1.9, "learning_rate": 6.519915707555834e-08, "logits/chosen": -1.661884069442749, "logits/rejected": -1.661884069442749, "logps/chosen": -32.99663543701172, "logps/rejected": -32.99663543701172, "loss": 2.0232, "rewards/accuracies": 0.0, "rewards/chosen": 7.236354351043701, "rewards/margins": 0.0, "rewards/rejected": 7.236354351043701, "step": 8585 }, { "epoch": 1.9, "learning_rate": 6.49109726357372e-08, "logits/chosen": -2.068311929702759, "logits/rejected": -2.069751501083374, "logps/chosen": -64.01522064208984, "logps/rejected": -123.41567993164062, "loss": 0.1374, "rewards/accuracies": 1.0, "rewards/chosen": 6.9555792808532715, "rewards/margins": 1.3553886413574219, "rewards/rejected": 5.60019063949585, "step": 8586 }, { "epoch": 1.9, "learning_rate": 6.462342234299258e-08, "logits/chosen": -1.9954943656921387, "logits/rejected": -2.0003762245178223, "logps/chosen": -44.80400085449219, "logps/rejected": -55.06379699707031, "loss": 0.6317, "rewards/accuracies": 0.0, "rewards/chosen": 4.756095886230469, "rewards/margins": -0.9103960990905762, "rewards/rejected": 5.666491985321045, "step": 8587 }, { "epoch": 1.9, "learning_rate": 6.433650623427379e-08, "logits/chosen": -1.9874868392944336, "logits/rejected": -1.8889029026031494, "logps/chosen": -108.75515747070312, "logps/rejected": -65.33108520507812, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 5.2755889892578125, "rewards/margins": 2.0835068225860596, "rewards/rejected": 3.192082166671753, "step": 8588 }, { "epoch": 1.9, "learning_rate": 6.405022434644914e-08, "logits/chosen": -2.2484962940216064, "logits/rejected": -2.068016529083252, "logps/chosen": -26.923912048339844, "logps/rejected": -90.43566131591797, "loss": 0.2417, "rewards/accuracies": 1.0, "rewards/chosen": 3.7642159461975098, "rewards/margins": 0.4771561622619629, "rewards/rejected": 3.287059783935547, "step": 8589 }, { "epoch": 1.9, "learning_rate": 6.376457671630421e-08, "logits/chosen": -1.9721192121505737, "logits/rejected": -2.02506947517395, "logps/chosen": -48.53955078125, "logps/rejected": -64.71762084960938, "loss": 0.8318, "rewards/accuracies": 0.0, "rewards/chosen": 6.122661113739014, "rewards/margins": -1.2502026557922363, "rewards/rejected": 7.37286376953125, "step": 8590 }, { "epoch": 1.9, "learning_rate": 6.347956338054572e-08, "logits/chosen": -1.8056236505508423, "logits/rejected": -1.8200699090957642, "logps/chosen": -17.698020935058594, "logps/rejected": -41.72528076171875, "loss": 0.2954, "rewards/accuracies": 1.0, "rewards/chosen": 4.2700114250183105, "rewards/margins": 0.4110407829284668, "rewards/rejected": 3.8589706420898438, "step": 8591 }, { "epoch": 1.9, "learning_rate": 6.319518437579552e-08, "logits/chosen": -2.1858606338500977, "logits/rejected": -2.1839685440063477, "logps/chosen": -65.52359771728516, "logps/rejected": -71.04090881347656, "loss": 0.2159, "rewards/accuracies": 1.0, "rewards/chosen": 4.635617733001709, "rewards/margins": 0.8389036655426025, "rewards/rejected": 3.7967140674591064, "step": 8592 }, { "epoch": 1.9, "learning_rate": 6.291143973859659e-08, "logits/chosen": -1.6833776235580444, "logits/rejected": -1.6424602270126343, "logps/chosen": -53.50634765625, "logps/rejected": -46.636600494384766, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 3.540297031402588, "rewards/margins": 2.533381938934326, "rewards/rejected": 1.0069149732589722, "step": 8593 }, { "epoch": 1.9, "learning_rate": 6.262832950540976e-08, "logits/chosen": -1.7413969039916992, "logits/rejected": -1.7573647499084473, "logps/chosen": -21.824687957763672, "logps/rejected": -76.59053802490234, "loss": 0.5216, "rewards/accuracies": 0.0, "rewards/chosen": 2.731123447418213, "rewards/margins": -0.3495628833770752, "rewards/rejected": 3.080686330795288, "step": 8594 }, { "epoch": 1.9, "learning_rate": 6.23458537126126e-08, "logits/chosen": -2.0961008071899414, "logits/rejected": -2.0754470825195312, "logps/chosen": -42.83957290649414, "logps/rejected": -66.27345275878906, "loss": 0.1621, "rewards/accuracies": 1.0, "rewards/chosen": 3.2621426582336426, "rewards/margins": 1.15226411819458, "rewards/rejected": 2.1098785400390625, "step": 8595 }, { "epoch": 1.9, "learning_rate": 6.206401239650383e-08, "logits/chosen": -1.8098585605621338, "logits/rejected": -1.8098585605621338, "logps/chosen": -47.98725128173828, "logps/rejected": -47.98725128173828, "loss": 0.3957, "rewards/accuracies": 0.0, "rewards/chosen": 5.5871453285217285, "rewards/margins": 0.0, "rewards/rejected": 5.5871453285217285, "step": 8596 }, { "epoch": 1.9, "learning_rate": 6.178280559329953e-08, "logits/chosen": -2.181122064590454, "logits/rejected": -2.120021104812622, "logps/chosen": -92.28978729248047, "logps/rejected": -36.998355865478516, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 5.89941930770874, "rewards/margins": 2.3722918033599854, "rewards/rejected": 3.527127504348755, "step": 8597 }, { "epoch": 1.9, "learning_rate": 6.150223333913296e-08, "logits/chosen": -2.0296218395233154, "logits/rejected": -2.020733594894409, "logps/chosen": -105.7477035522461, "logps/rejected": -89.21964263916016, "loss": 0.1739, "rewards/accuracies": 1.0, "rewards/chosen": 10.320990562438965, "rewards/margins": 1.1577835083007812, "rewards/rejected": 9.163207054138184, "step": 8598 }, { "epoch": 1.9, "learning_rate": 6.122229567005755e-08, "logits/chosen": -2.2175090312957764, "logits/rejected": -2.1749260425567627, "logps/chosen": -140.18075561523438, "logps/rejected": -26.20749282836914, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 8.237625122070312, "rewards/margins": 4.249624252319336, "rewards/rejected": 3.9880008697509766, "step": 8599 }, { "epoch": 1.9, "learning_rate": 6.094299262204506e-08, "logits/chosen": -2.274292230606079, "logits/rejected": -2.2741408348083496, "logps/chosen": -66.41996002197266, "logps/rejected": -170.12831115722656, "loss": 0.1908, "rewards/accuracies": 1.0, "rewards/chosen": 9.30423355102539, "rewards/margins": 0.8190631866455078, "rewards/rejected": 8.485170364379883, "step": 8600 }, { "epoch": 1.9, "learning_rate": 6.066432423098457e-08, "logits/chosen": -1.8330802917480469, "logits/rejected": -1.8920490741729736, "logps/chosen": -29.349973678588867, "logps/rejected": -82.9117202758789, "loss": 1.0974, "rewards/accuracies": 0.0, "rewards/chosen": 4.160735607147217, "rewards/margins": -2.0631651878356934, "rewards/rejected": 6.22390079498291, "step": 8601 }, { "epoch": 1.9, "learning_rate": 6.038629053268464e-08, "logits/chosen": -1.9520530700683594, "logits/rejected": -1.4140805006027222, "logps/chosen": -35.68109130859375, "logps/rejected": -71.63374328613281, "loss": 0.5673, "rewards/accuracies": 0.0, "rewards/chosen": 3.9012954235076904, "rewards/margins": -0.4838845729827881, "rewards/rejected": 4.3851799964904785, "step": 8602 }, { "epoch": 1.9, "learning_rate": 6.01088915628717e-08, "logits/chosen": -1.8183043003082275, "logits/rejected": -1.8524446487426758, "logps/chosen": -43.35142517089844, "logps/rejected": -91.39242553710938, "loss": 0.4778, "rewards/accuracies": 0.0, "rewards/chosen": 5.25563383102417, "rewards/margins": -0.029083251953125, "rewards/rejected": 5.284717082977295, "step": 8603 }, { "epoch": 1.9, "learning_rate": 5.983212735719113e-08, "logits/chosen": -2.2101073265075684, "logits/rejected": -2.2360079288482666, "logps/chosen": -57.865360260009766, "logps/rejected": -118.59869384765625, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 7.339781284332275, "rewards/margins": 1.9528274536132812, "rewards/rejected": 5.386953830718994, "step": 8604 }, { "epoch": 1.9, "learning_rate": 5.95559979512067e-08, "logits/chosen": -2.0540544986724854, "logits/rejected": -2.0342037677764893, "logps/chosen": -110.35652160644531, "logps/rejected": -45.88046646118164, "loss": 0.4519, "rewards/accuracies": 1.0, "rewards/chosen": 6.798867702484131, "rewards/margins": 3.1438815593719482, "rewards/rejected": 3.6549861431121826, "step": 8605 }, { "epoch": 1.9, "learning_rate": 5.9280503380399476e-08, "logits/chosen": -1.8770861625671387, "logits/rejected": -1.691572666168213, "logps/chosen": -29.943437576293945, "logps/rejected": -127.39801788330078, "loss": 0.6159, "rewards/accuracies": 0.0, "rewards/chosen": 4.0191545486450195, "rewards/margins": -0.5045933723449707, "rewards/rejected": 4.52374792098999, "step": 8606 }, { "epoch": 1.91, "learning_rate": 5.900564368017059e-08, "logits/chosen": -1.8844810724258423, "logits/rejected": -1.895369052886963, "logps/chosen": -39.932212829589844, "logps/rejected": -29.79649543762207, "loss": 2.0876, "rewards/accuracies": 0.0, "rewards/chosen": 3.8240106105804443, "rewards/margins": -2.4769294261932373, "rewards/rejected": 6.300940036773682, "step": 8607 }, { "epoch": 1.91, "learning_rate": 5.8731418885839556e-08, "logits/chosen": -1.8357927799224854, "logits/rejected": -1.8487107753753662, "logps/chosen": -30.952102661132812, "logps/rejected": -71.44413757324219, "loss": 0.2495, "rewards/accuracies": 1.0, "rewards/chosen": 5.192623138427734, "rewards/margins": 1.229424238204956, "rewards/rejected": 3.9631989002227783, "step": 8608 }, { "epoch": 1.91, "learning_rate": 5.845782903264152e-08, "logits/chosen": -1.9203709363937378, "logits/rejected": -1.9089759588241577, "logps/chosen": -54.15516662597656, "logps/rejected": -109.58343505859375, "loss": 2.4339, "rewards/accuracies": 1.0, "rewards/chosen": 3.3185882568359375, "rewards/margins": 2.2118821144104004, "rewards/rejected": 1.1067062616348267, "step": 8609 }, { "epoch": 1.91, "learning_rate": 5.818487415573393e-08, "logits/chosen": -1.7593516111373901, "logits/rejected": -1.7165955305099487, "logps/chosen": -33.64341735839844, "logps/rejected": -20.17788314819336, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 2.4348647594451904, "rewards/margins": 0.44037961959838867, "rewards/rejected": 1.9944851398468018, "step": 8610 }, { "epoch": 1.91, "learning_rate": 5.791255429018983e-08, "logits/chosen": -1.9110472202301025, "logits/rejected": -1.9110472202301025, "logps/chosen": -38.59965133666992, "logps/rejected": -38.59965133666992, "loss": 0.4128, "rewards/accuracies": 0.0, "rewards/chosen": 7.551322937011719, "rewards/margins": 0.0, "rewards/rejected": 7.551322937011719, "step": 8611 }, { "epoch": 1.91, "learning_rate": 5.764086947100234e-08, "logits/chosen": -1.9378021955490112, "logits/rejected": -1.552438497543335, "logps/chosen": -39.59223175048828, "logps/rejected": -87.31321716308594, "loss": 0.2797, "rewards/accuracies": 1.0, "rewards/chosen": 6.857614994049072, "rewards/margins": 0.7060599327087402, "rewards/rejected": 6.151555061340332, "step": 8612 }, { "epoch": 1.91, "learning_rate": 5.7369819733081865e-08, "logits/chosen": -1.8627197742462158, "logits/rejected": -1.796582579612732, "logps/chosen": -54.195682525634766, "logps/rejected": -25.53823471069336, "loss": 0.2032, "rewards/accuracies": 1.0, "rewards/chosen": 3.247858762741089, "rewards/margins": 1.5825927257537842, "rewards/rejected": 1.6652660369873047, "step": 8613 }, { "epoch": 1.91, "learning_rate": 5.709940511125778e-08, "logits/chosen": -1.816188931465149, "logits/rejected": -1.6823205947875977, "logps/chosen": -63.77430725097656, "logps/rejected": -98.76090240478516, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 5.887066841125488, "rewards/margins": 4.815381050109863, "rewards/rejected": 1.071685791015625, "step": 8614 }, { "epoch": 1.91, "learning_rate": 5.682962564027783e-08, "logits/chosen": -1.693205714225769, "logits/rejected": -1.7363089323043823, "logps/chosen": -42.840187072753906, "logps/rejected": -132.06063842773438, "loss": 0.6442, "rewards/accuracies": 1.0, "rewards/chosen": 4.470499515533447, "rewards/margins": 1.4537835121154785, "rewards/rejected": 3.0167160034179688, "step": 8615 }, { "epoch": 1.91, "learning_rate": 5.6560481354807625e-08, "logits/chosen": -1.896012783050537, "logits/rejected": -1.874578833580017, "logps/chosen": -26.722270965576172, "logps/rejected": -21.848567962646484, "loss": 0.3938, "rewards/accuracies": 1.0, "rewards/chosen": 2.828599214553833, "rewards/margins": 0.5722355842590332, "rewards/rejected": 2.2563636302948, "step": 8616 }, { "epoch": 1.91, "learning_rate": 5.6291972289432286e-08, "logits/chosen": -1.8830045461654663, "logits/rejected": -1.9160003662109375, "logps/chosen": -35.12053680419922, "logps/rejected": -85.2852783203125, "loss": 0.5254, "rewards/accuracies": 0.0, "rewards/chosen": 6.987288951873779, "rewards/margins": -0.5854454040527344, "rewards/rejected": 7.572734355926514, "step": 8617 }, { "epoch": 1.91, "learning_rate": 5.6024098478653644e-08, "logits/chosen": -2.075110673904419, "logits/rejected": -1.9681384563446045, "logps/chosen": -129.4498291015625, "logps/rejected": -60.690208435058594, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 10.670392036437988, "rewards/margins": 7.179537773132324, "rewards/rejected": 3.490854024887085, "step": 8618 }, { "epoch": 1.91, "learning_rate": 5.575685995689417e-08, "logits/chosen": -1.8898861408233643, "logits/rejected": -1.790879487991333, "logps/chosen": -91.82279968261719, "logps/rejected": -84.84690856933594, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 9.495722770690918, "rewards/margins": 6.945806503295898, "rewards/rejected": 2.5499160289764404, "step": 8619 }, { "epoch": 1.91, "learning_rate": 5.5490256758491956e-08, "logits/chosen": -1.8788995742797852, "logits/rejected": -1.7960976362228394, "logps/chosen": -52.04271697998047, "logps/rejected": -38.43629455566406, "loss": 0.1417, "rewards/accuracies": 1.0, "rewards/chosen": 4.09576940536499, "rewards/margins": 1.181149959564209, "rewards/rejected": 2.9146194458007812, "step": 8620 }, { "epoch": 1.91, "learning_rate": 5.52242889177057e-08, "logits/chosen": -1.8985881805419922, "logits/rejected": -1.763807773590088, "logps/chosen": -102.6362075805664, "logps/rejected": -143.5964813232422, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 7.6206231117248535, "rewards/margins": 3.6791024208068848, "rewards/rejected": 3.9415206909179688, "step": 8621 }, { "epoch": 1.91, "learning_rate": 5.4958956468711414e-08, "logits/chosen": -2.2349822521209717, "logits/rejected": -2.209057569503784, "logps/chosen": -119.63848876953125, "logps/rejected": -72.66741943359375, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 8.780306816101074, "rewards/margins": 3.016631603240967, "rewards/rejected": 5.763675212860107, "step": 8622 }, { "epoch": 1.91, "learning_rate": 5.469425944560347e-08, "logits/chosen": -1.9014877080917358, "logits/rejected": -1.919870138168335, "logps/chosen": -62.662235260009766, "logps/rejected": -108.66795349121094, "loss": 0.4366, "rewards/accuracies": 0.0, "rewards/chosen": 5.940566062927246, "rewards/margins": -0.27245140075683594, "rewards/rejected": 6.213017463684082, "step": 8623 }, { "epoch": 1.91, "learning_rate": 5.443019788239523e-08, "logits/chosen": -2.219187021255493, "logits/rejected": -2.2356221675872803, "logps/chosen": -37.83589172363281, "logps/rejected": -157.838623046875, "loss": 0.2121, "rewards/accuracies": 1.0, "rewards/chosen": 5.351348876953125, "rewards/margins": 0.7312588691711426, "rewards/rejected": 4.620090007781982, "step": 8624 }, { "epoch": 1.91, "learning_rate": 5.4166771813017885e-08, "logits/chosen": -1.6762627363204956, "logits/rejected": -0.8841769695281982, "logps/chosen": -28.982437133789062, "logps/rejected": -30.948774337768555, "loss": 0.6035, "rewards/accuracies": 1.0, "rewards/chosen": 2.505753755569458, "rewards/margins": 1.5962587594985962, "rewards/rejected": 0.9094949960708618, "step": 8625 }, { "epoch": 1.91, "learning_rate": 5.3903981271320463e-08, "logits/chosen": -1.7514612674713135, "logits/rejected": -1.6757160425186157, "logps/chosen": -40.76634979248047, "logps/rejected": -24.2554931640625, "loss": 0.3861, "rewards/accuracies": 1.0, "rewards/chosen": 2.7635879516601562, "rewards/margins": 0.5107276439666748, "rewards/rejected": 2.2528603076934814, "step": 8626 }, { "epoch": 1.91, "learning_rate": 5.3641826291071506e-08, "logits/chosen": -1.8329435586929321, "logits/rejected": -1.7998175621032715, "logps/chosen": -22.50299644470215, "logps/rejected": -25.58188247680664, "loss": 0.7659, "rewards/accuracies": 0.0, "rewards/chosen": 3.4557502269744873, "rewards/margins": -0.9261386394500732, "rewards/rejected": 4.3818888664245605, "step": 8627 }, { "epoch": 1.91, "learning_rate": 5.3380306905957413e-08, "logits/chosen": -1.7506749629974365, "logits/rejected": -1.8226792812347412, "logps/chosen": -38.01848602294922, "logps/rejected": -51.29003143310547, "loss": 1.9917, "rewards/accuracies": 0.0, "rewards/chosen": 2.3505916595458984, "rewards/margins": -3.936521053314209, "rewards/rejected": 6.287112712860107, "step": 8628 }, { "epoch": 1.91, "learning_rate": 5.3119423149582405e-08, "logits/chosen": -2.0099332332611084, "logits/rejected": -2.036006212234497, "logps/chosen": -102.10626220703125, "logps/rejected": -185.57339477539062, "loss": 0.143, "rewards/accuracies": 1.0, "rewards/chosen": 10.099893569946289, "rewards/margins": 1.2158451080322266, "rewards/rejected": 8.884048461914062, "step": 8629 }, { "epoch": 1.91, "learning_rate": 5.285917505546967e-08, "logits/chosen": -1.877108097076416, "logits/rejected": -1.7714786529541016, "logps/chosen": -132.15451049804688, "logps/rejected": -90.81144714355469, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 9.234857559204102, "rewards/margins": 2.9020676612854004, "rewards/rejected": 6.332789897918701, "step": 8630 }, { "epoch": 1.91, "learning_rate": 5.259956265705968e-08, "logits/chosen": -1.9021204710006714, "logits/rejected": -1.7949905395507812, "logps/chosen": -104.672119140625, "logps/rejected": -63.40357971191406, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": 3.893881320953369, "rewards/margins": 2.922146797180176, "rewards/rejected": 0.9717346429824829, "step": 8631 }, { "epoch": 1.91, "learning_rate": 5.234058598771352e-08, "logits/chosen": -1.7408716678619385, "logits/rejected": -1.6338022947311401, "logps/chosen": -33.4423942565918, "logps/rejected": -22.011066436767578, "loss": 0.7752, "rewards/accuracies": 1.0, "rewards/chosen": 3.54400897026062, "rewards/margins": 1.5445895195007324, "rewards/rejected": 1.9994194507598877, "step": 8632 }, { "epoch": 1.91, "learning_rate": 5.208224508070736e-08, "logits/chosen": -1.9580576419830322, "logits/rejected": -1.9580576419830322, "logps/chosen": -29.92238998413086, "logps/rejected": -29.92238998413086, "loss": 0.7527, "rewards/accuracies": 0.0, "rewards/chosen": 4.07111930847168, "rewards/margins": 0.0, "rewards/rejected": 4.07111930847168, "step": 8633 }, { "epoch": 1.91, "learning_rate": 5.182453996923909e-08, "logits/chosen": -2.096212863922119, "logits/rejected": -2.1557090282440186, "logps/chosen": -80.62088012695312, "logps/rejected": -139.8326416015625, "loss": 2.2386, "rewards/accuracies": 0.0, "rewards/chosen": 8.901165962219238, "rewards/margins": -4.448773384094238, "rewards/rejected": 13.349939346313477, "step": 8634 }, { "epoch": 1.91, "learning_rate": 5.1567470686422205e-08, "logits/chosen": -2.0259618759155273, "logits/rejected": -1.9462099075317383, "logps/chosen": -55.116424560546875, "logps/rejected": -30.2667236328125, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 7.0179009437561035, "rewards/margins": 1.878859043121338, "rewards/rejected": 5.139041900634766, "step": 8635 }, { "epoch": 1.91, "learning_rate": 5.131103726528919e-08, "logits/chosen": -1.8851038217544556, "logits/rejected": -1.8443818092346191, "logps/chosen": -69.73574829101562, "logps/rejected": -41.430145263671875, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 5.683513641357422, "rewards/margins": 2.0911941528320312, "rewards/rejected": 3.5923194885253906, "step": 8636 }, { "epoch": 1.91, "learning_rate": 5.1055239738792006e-08, "logits/chosen": -1.9093049764633179, "logits/rejected": -1.894882321357727, "logps/chosen": -51.22624206542969, "logps/rejected": -63.180118560791016, "loss": 0.5914, "rewards/accuracies": 0.0, "rewards/chosen": 3.537945508956909, "rewards/margins": -0.5766003131866455, "rewards/rejected": 4.114545822143555, "step": 8637 }, { "epoch": 1.91, "learning_rate": 5.0800078139798814e-08, "logits/chosen": -2.1523022651672363, "logits/rejected": -2.187936305999756, "logps/chosen": -54.036434173583984, "logps/rejected": -126.55585479736328, "loss": 0.2543, "rewards/accuracies": 1.0, "rewards/chosen": 9.070042610168457, "rewards/margins": 0.42722511291503906, "rewards/rejected": 8.642817497253418, "step": 8638 }, { "epoch": 1.91, "learning_rate": 5.054555250109894e-08, "logits/chosen": -1.8931487798690796, "logits/rejected": -1.9185312986373901, "logps/chosen": -56.738807678222656, "logps/rejected": -100.6987533569336, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": 4.12447452545166, "rewards/margins": 2.087146043777466, "rewards/rejected": 2.0373284816741943, "step": 8639 }, { "epoch": 1.91, "learning_rate": 5.029166285539677e-08, "logits/chosen": -2.123896360397339, "logits/rejected": -2.0659806728363037, "logps/chosen": -66.59129333496094, "logps/rejected": -75.72908782958984, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 4.889470100402832, "rewards/margins": 2.8304200172424316, "rewards/rejected": 2.0590500831604004, "step": 8640 }, { "epoch": 1.91, "learning_rate": 5.0038409235317885e-08, "logits/chosen": -1.9118285179138184, "logits/rejected": -1.8671380281448364, "logps/chosen": -41.921844482421875, "logps/rejected": -52.617645263671875, "loss": 0.2555, "rewards/accuracies": 1.0, "rewards/chosen": 4.602765083312988, "rewards/margins": 0.6193208694458008, "rewards/rejected": 3.9834442138671875, "step": 8641 }, { "epoch": 1.91, "learning_rate": 4.978579167340347e-08, "logits/chosen": -1.9818973541259766, "logits/rejected": -1.9573147296905518, "logps/chosen": -55.920745849609375, "logps/rejected": -38.00611114501953, "loss": 0.2172, "rewards/accuracies": 1.0, "rewards/chosen": 5.117049694061279, "rewards/margins": 1.2860743999481201, "rewards/rejected": 3.830975294113159, "step": 8642 }, { "epoch": 1.91, "learning_rate": 4.9533810202114784e-08, "logits/chosen": -1.9051891565322876, "logits/rejected": -1.9135627746582031, "logps/chosen": -68.61814880371094, "logps/rejected": -59.483665466308594, "loss": 0.1378, "rewards/accuracies": 1.0, "rewards/chosen": 3.6080315113067627, "rewards/margins": 1.161848545074463, "rewards/rejected": 2.4461829662323, "step": 8643 }, { "epoch": 1.91, "learning_rate": 4.928246485383148e-08, "logits/chosen": -1.9126838445663452, "logits/rejected": -2.0099985599517822, "logps/chosen": -28.981447219848633, "logps/rejected": -88.12403106689453, "loss": 2.9137, "rewards/accuracies": 0.0, "rewards/chosen": 5.363460063934326, "rewards/margins": -4.860776424407959, "rewards/rejected": 10.224236488342285, "step": 8644 }, { "epoch": 1.91, "learning_rate": 4.903175566084939e-08, "logits/chosen": -1.7853938341140747, "logits/rejected": -1.7853938341140747, "logps/chosen": -41.96356201171875, "logps/rejected": -41.96356201171875, "loss": 0.5274, "rewards/accuracies": 0.0, "rewards/chosen": 4.1393022537231445, "rewards/margins": 0.0, "rewards/rejected": 4.1393022537231445, "step": 8645 }, { "epoch": 1.91, "learning_rate": 4.878168265538552e-08, "logits/chosen": -1.8838245868682861, "logits/rejected": -1.8015865087509155, "logps/chosen": -51.920196533203125, "logps/rejected": -36.17931365966797, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 7.232468605041504, "rewards/margins": 2.3899035453796387, "rewards/rejected": 4.842565059661865, "step": 8646 }, { "epoch": 1.91, "learning_rate": 4.8532245869572503e-08, "logits/chosen": -1.86406672000885, "logits/rejected": -1.7018864154815674, "logps/chosen": -83.42386627197266, "logps/rejected": -35.04494857788086, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 5.6093316078186035, "rewards/margins": 2.1917378902435303, "rewards/rejected": 3.4175937175750732, "step": 8647 }, { "epoch": 1.91, "learning_rate": 4.828344533546358e-08, "logits/chosen": -1.8960754871368408, "logits/rejected": -1.7875458002090454, "logps/chosen": -82.67259216308594, "logps/rejected": -54.264305114746094, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 7.149547100067139, "rewards/margins": 4.181628227233887, "rewards/rejected": 2.967919111251831, "step": 8648 }, { "epoch": 1.91, "learning_rate": 4.80352810850282e-08, "logits/chosen": -1.931453824043274, "logits/rejected": -1.8815840482711792, "logps/chosen": -109.8603286743164, "logps/rejected": -70.14823913574219, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 5.538973331451416, "rewards/margins": 2.844947099685669, "rewards/rejected": 2.694026231765747, "step": 8649 }, { "epoch": 1.91, "learning_rate": 4.7787753150154717e-08, "logits/chosen": -2.086617946624756, "logits/rejected": -2.0446269512176514, "logps/chosen": -61.04716873168945, "logps/rejected": -76.71694946289062, "loss": 0.0868, "rewards/accuracies": 1.0, "rewards/chosen": 7.822947978973389, "rewards/margins": 1.7481660842895508, "rewards/rejected": 6.074781894683838, "step": 8650 }, { "epoch": 1.91, "learning_rate": 4.754086156264992e-08, "logits/chosen": -1.9954913854599, "logits/rejected": -1.9010001420974731, "logps/chosen": -54.69486618041992, "logps/rejected": -55.81241989135742, "loss": 1.2225, "rewards/accuracies": 1.0, "rewards/chosen": 4.950928211212158, "rewards/margins": 2.1421616077423096, "rewards/rejected": 2.8087666034698486, "step": 8651 }, { "epoch": 1.92, "learning_rate": 4.729460635423955e-08, "logits/chosen": -1.9263774156570435, "logits/rejected": -1.8946157693862915, "logps/chosen": -175.18521118164062, "logps/rejected": -54.12052917480469, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 7.417715549468994, "rewards/margins": 2.207980155944824, "rewards/rejected": 5.20973539352417, "step": 8652 }, { "epoch": 1.92, "learning_rate": 4.704898755656606e-08, "logits/chosen": -1.4928535223007202, "logits/rejected": -1.3364837169647217, "logps/chosen": -42.540069580078125, "logps/rejected": -4.678897380828857, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 2.717294454574585, "rewards/margins": 1.838150143623352, "rewards/rejected": 0.8791443109512329, "step": 8653 }, { "epoch": 1.92, "learning_rate": 4.6804005201190884e-08, "logits/chosen": -1.8469403982162476, "logits/rejected": -1.8566620349884033, "logps/chosen": -20.339733123779297, "logps/rejected": -61.99495315551758, "loss": 0.5918, "rewards/accuracies": 0.0, "rewards/chosen": 3.2936654090881348, "rewards/margins": -0.8167605400085449, "rewards/rejected": 4.11042594909668, "step": 8654 }, { "epoch": 1.92, "learning_rate": 4.655965931959439e-08, "logits/chosen": -2.2473740577697754, "logits/rejected": -2.269951581954956, "logps/chosen": -99.63504791259766, "logps/rejected": -101.92037963867188, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 11.106130599975586, "rewards/margins": 7.2647318840026855, "rewards/rejected": 3.8413987159729004, "step": 8655 }, { "epoch": 1.92, "learning_rate": 4.631594994317368e-08, "logits/chosen": -1.9159605503082275, "logits/rejected": -1.9007987976074219, "logps/chosen": -39.99421691894531, "logps/rejected": -67.27072143554688, "loss": 0.552, "rewards/accuracies": 1.0, "rewards/chosen": 3.738497257232666, "rewards/margins": 1.5243890285491943, "rewards/rejected": 2.2141082286834717, "step": 8656 }, { "epoch": 1.92, "learning_rate": 4.607287710324537e-08, "logits/chosen": -1.6426117420196533, "logits/rejected": -1.6426117420196533, "logps/chosen": -23.710695266723633, "logps/rejected": -23.710695266723633, "loss": 1.4613, "rewards/accuracies": 0.0, "rewards/chosen": 4.575844764709473, "rewards/margins": 0.0, "rewards/rejected": 4.575844764709473, "step": 8657 }, { "epoch": 1.92, "learning_rate": 4.583044083104282e-08, "logits/chosen": -1.9081846475601196, "logits/rejected": -1.8501496315002441, "logps/chosen": -60.07277297973633, "logps/rejected": -125.11167907714844, "loss": 1.7369, "rewards/accuracies": 0.0, "rewards/chosen": 6.055108547210693, "rewards/margins": -3.351609706878662, "rewards/rejected": 9.406718254089355, "step": 8658 }, { "epoch": 1.92, "learning_rate": 4.558864115771944e-08, "logits/chosen": -1.6585850715637207, "logits/rejected": -1.7044429779052734, "logps/chosen": -40.68402862548828, "logps/rejected": -56.73255157470703, "loss": 0.3073, "rewards/accuracies": 1.0, "rewards/chosen": 3.6907525062561035, "rewards/margins": 0.20299839973449707, "rewards/rejected": 3.4877541065216064, "step": 8659 }, { "epoch": 1.92, "learning_rate": 4.534747811434592e-08, "logits/chosen": -2.2071533203125, "logits/rejected": -2.2196948528289795, "logps/chosen": -139.71356201171875, "logps/rejected": -59.16714859008789, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 9.490951538085938, "rewards/margins": 5.4725117683410645, "rewards/rejected": 4.018439769744873, "step": 8660 }, { "epoch": 1.92, "learning_rate": 4.5106951731910267e-08, "logits/chosen": -1.9119818210601807, "logits/rejected": -1.9095770120620728, "logps/chosen": -44.56733703613281, "logps/rejected": -50.591529846191406, "loss": 0.2473, "rewards/accuracies": 1.0, "rewards/chosen": 2.965139150619507, "rewards/margins": 0.48519372940063477, "rewards/rejected": 2.479945421218872, "step": 8661 }, { "epoch": 1.92, "learning_rate": 4.486706204132052e-08, "logits/chosen": -2.0308010578155518, "logits/rejected": -2.0365490913391113, "logps/chosen": -26.215648651123047, "logps/rejected": -62.285247802734375, "loss": 1.3988, "rewards/accuracies": 1.0, "rewards/chosen": 3.587517499923706, "rewards/margins": 0.25800085067749023, "rewards/rejected": 3.329516649246216, "step": 8662 }, { "epoch": 1.92, "learning_rate": 4.462780907340092e-08, "logits/chosen": -1.820205569267273, "logits/rejected": -1.768191933631897, "logps/chosen": -69.41950988769531, "logps/rejected": -61.71551513671875, "loss": 0.1299, "rewards/accuracies": 1.0, "rewards/chosen": 4.257499694824219, "rewards/margins": 1.4679863452911377, "rewards/rejected": 2.789513349533081, "step": 8663 }, { "epoch": 1.92, "learning_rate": 4.4389192858896315e-08, "logits/chosen": -2.035310983657837, "logits/rejected": -1.971355676651001, "logps/chosen": -112.36963653564453, "logps/rejected": -91.99720764160156, "loss": 0.3572, "rewards/accuracies": 1.0, "rewards/chosen": 5.567432403564453, "rewards/margins": 3.8412046432495117, "rewards/rejected": 1.7262276411056519, "step": 8664 }, { "epoch": 1.92, "learning_rate": 4.415121342846662e-08, "logits/chosen": -2.160320997238159, "logits/rejected": -2.1255581378936768, "logps/chosen": -46.1543083190918, "logps/rejected": -43.05854797363281, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 3.8934972286224365, "rewards/margins": 1.5897221565246582, "rewards/rejected": 2.3037750720977783, "step": 8665 }, { "epoch": 1.92, "learning_rate": 4.391387081269294e-08, "logits/chosen": -1.8011486530303955, "logits/rejected": -1.7826738357543945, "logps/chosen": -20.984943389892578, "logps/rejected": -19.63585662841797, "loss": 0.4207, "rewards/accuracies": 1.0, "rewards/chosen": 2.6445109844207764, "rewards/margins": 0.235260009765625, "rewards/rejected": 2.4092509746551514, "step": 8666 }, { "epoch": 1.92, "learning_rate": 4.3677165042072534e-08, "logits/chosen": -2.1274893283843994, "logits/rejected": -2.1777563095092773, "logps/chosen": -46.92153549194336, "logps/rejected": -96.98432159423828, "loss": 1.7331, "rewards/accuracies": 0.0, "rewards/chosen": 5.866252422332764, "rewards/margins": -1.5597476959228516, "rewards/rejected": 7.426000118255615, "step": 8667 }, { "epoch": 1.92, "learning_rate": 4.344109614702219e-08, "logits/chosen": -1.931879997253418, "logits/rejected": -1.913329005241394, "logps/chosen": -38.45506286621094, "logps/rejected": -59.985595703125, "loss": 0.4458, "rewards/accuracies": 1.0, "rewards/chosen": 4.570901393890381, "rewards/margins": 0.42595338821411133, "rewards/rejected": 4.1449480056762695, "step": 8668 }, { "epoch": 1.92, "learning_rate": 4.3205664157875416e-08, "logits/chosen": -1.827296495437622, "logits/rejected": -1.5977323055267334, "logps/chosen": -34.643917083740234, "logps/rejected": -56.03561782836914, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": 3.682217836380005, "rewards/margins": 0.8719954490661621, "rewards/rejected": 2.8102223873138428, "step": 8669 }, { "epoch": 1.92, "learning_rate": 4.2970869104884685e-08, "logits/chosen": -2.254181385040283, "logits/rejected": -2.16886305809021, "logps/chosen": -99.98605346679688, "logps/rejected": -131.663818359375, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 8.240155220031738, "rewards/margins": 2.8079848289489746, "rewards/rejected": 5.432170391082764, "step": 8670 }, { "epoch": 1.92, "learning_rate": 4.273671101822141e-08, "logits/chosen": -2.1699130535125732, "logits/rejected": -2.1663310527801514, "logps/chosen": -114.83078002929688, "logps/rejected": -191.0720672607422, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 9.94750690460205, "rewards/margins": 3.6753554344177246, "rewards/rejected": 6.272151470184326, "step": 8671 }, { "epoch": 1.92, "learning_rate": 4.250318992797375e-08, "logits/chosen": -1.8796941041946411, "logits/rejected": -1.8878732919692993, "logps/chosen": -32.1872673034668, "logps/rejected": -75.1855239868164, "loss": 1.9496, "rewards/accuracies": 0.0, "rewards/chosen": 3.4755101203918457, "rewards/margins": -3.3992419242858887, "rewards/rejected": 6.874752044677734, "step": 8672 }, { "epoch": 1.92, "learning_rate": 4.227030586414882e-08, "logits/chosen": -2.0229456424713135, "logits/rejected": -2.0223333835601807, "logps/chosen": -153.87266540527344, "logps/rejected": -116.89596557617188, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 11.168647766113281, "rewards/margins": 3.5722532272338867, "rewards/rejected": 7.5963945388793945, "step": 8673 }, { "epoch": 1.92, "learning_rate": 4.2038058856671005e-08, "logits/chosen": -2.1321780681610107, "logits/rejected": -2.0664310455322266, "logps/chosen": -45.52735137939453, "logps/rejected": -49.81389617919922, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 4.474198341369629, "rewards/margins": 1.69181227684021, "rewards/rejected": 2.782386064529419, "step": 8674 }, { "epoch": 1.92, "learning_rate": 4.1806448935384215e-08, "logits/chosen": -1.7446372509002686, "logits/rejected": -1.723430871963501, "logps/chosen": -56.842140197753906, "logps/rejected": -51.39748764038086, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 3.813753604888916, "rewards/margins": 0.6012082099914551, "rewards/rejected": 3.212545394897461, "step": 8675 }, { "epoch": 1.92, "learning_rate": 4.157547613004964e-08, "logits/chosen": -2.051997423171997, "logits/rejected": -2.060018539428711, "logps/chosen": -65.94039916992188, "logps/rejected": -66.65499114990234, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 6.65957498550415, "rewards/margins": 2.6563034057617188, "rewards/rejected": 4.003271579742432, "step": 8676 }, { "epoch": 1.92, "learning_rate": 4.1345140470346876e-08, "logits/chosen": -1.6500331163406372, "logits/rejected": -1.563934087753296, "logps/chosen": -74.38865661621094, "logps/rejected": -49.855167388916016, "loss": 0.2983, "rewards/accuracies": 1.0, "rewards/chosen": 6.070333957672119, "rewards/margins": 0.4523196220397949, "rewards/rejected": 5.618014335632324, "step": 8677 }, { "epoch": 1.92, "learning_rate": 4.111544198587336e-08, "logits/chosen": -2.0397119522094727, "logits/rejected": -2.0241355895996094, "logps/chosen": -60.656524658203125, "logps/rejected": -82.09600830078125, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 4.826627254486084, "rewards/margins": 2.111520290374756, "rewards/rejected": 2.715106964111328, "step": 8678 }, { "epoch": 1.92, "learning_rate": 4.088638070614437e-08, "logits/chosen": -1.869858741760254, "logits/rejected": -1.8159446716308594, "logps/chosen": -68.18580627441406, "logps/rejected": -36.25292205810547, "loss": 0.4989, "rewards/accuracies": 1.0, "rewards/chosen": 4.858990669250488, "rewards/margins": 1.032193899154663, "rewards/rejected": 3.826796770095825, "step": 8679 }, { "epoch": 1.92, "learning_rate": 4.065795666059413e-08, "logits/chosen": -2.2389843463897705, "logits/rejected": -2.2039670944213867, "logps/chosen": -78.3839111328125, "logps/rejected": -92.75846099853516, "loss": 0.3128, "rewards/accuracies": 1.0, "rewards/chosen": 7.4248199462890625, "rewards/margins": 5.028372764587402, "rewards/rejected": 2.396446943283081, "step": 8680 }, { "epoch": 1.92, "learning_rate": 4.0430169878574174e-08, "logits/chosen": -1.9430809020996094, "logits/rejected": -1.6262297630310059, "logps/chosen": -64.31270599365234, "logps/rejected": -65.48788452148438, "loss": 1.019, "rewards/accuracies": 0.0, "rewards/chosen": 5.131237030029297, "rewards/margins": -0.16597700119018555, "rewards/rejected": 5.297214031219482, "step": 8681 }, { "epoch": 1.92, "learning_rate": 4.020302038935553e-08, "logits/chosen": -1.7542157173156738, "logits/rejected": -1.636906623840332, "logps/chosen": -46.659175872802734, "logps/rejected": -4.449932098388672, "loss": 0.069, "rewards/accuracies": 1.0, "rewards/chosen": 4.771018028259277, "rewards/margins": 3.733768939971924, "rewards/rejected": 1.0372490882873535, "step": 8682 }, { "epoch": 1.92, "learning_rate": 3.9976508222125956e-08, "logits/chosen": -2.3149313926696777, "logits/rejected": -2.3039348125457764, "logps/chosen": -34.2027702331543, "logps/rejected": -92.08872985839844, "loss": 0.3294, "rewards/accuracies": 1.0, "rewards/chosen": 6.326486587524414, "rewards/margins": 0.11784839630126953, "rewards/rejected": 6.2086381912231445, "step": 8683 }, { "epoch": 1.92, "learning_rate": 3.975063340599106e-08, "logits/chosen": -1.8004063367843628, "logits/rejected": -1.8020607233047485, "logps/chosen": -33.114768981933594, "logps/rejected": -33.89153289794922, "loss": 0.2809, "rewards/accuracies": 1.0, "rewards/chosen": 3.6339271068573, "rewards/margins": 0.3883330821990967, "rewards/rejected": 3.245594024658203, "step": 8684 }, { "epoch": 1.92, "learning_rate": 3.95253959699754e-08, "logits/chosen": -2.2481751441955566, "logits/rejected": -2.1573867797851562, "logps/chosen": -96.49440002441406, "logps/rejected": -16.81342124938965, "loss": 0.1849, "rewards/accuracies": 1.0, "rewards/chosen": 4.554154872894287, "rewards/margins": 2.1757495403289795, "rewards/rejected": 2.3784053325653076, "step": 8685 }, { "epoch": 1.92, "learning_rate": 3.9300795943021943e-08, "logits/chosen": -1.8747342824935913, "logits/rejected": -1.9162551164627075, "logps/chosen": -11.368572235107422, "logps/rejected": -26.082067489624023, "loss": 1.4119, "rewards/accuracies": 0.0, "rewards/chosen": 1.803435206413269, "rewards/margins": -2.2983827590942383, "rewards/rejected": 4.101818084716797, "step": 8686 }, { "epoch": 1.92, "learning_rate": 3.907683335399093e-08, "logits/chosen": -2.2521374225616455, "logits/rejected": -2.242982864379883, "logps/chosen": -102.38397216796875, "logps/rejected": -110.27001953125, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 7.070913791656494, "rewards/margins": 2.9695053100585938, "rewards/rejected": 4.1014084815979, "step": 8687 }, { "epoch": 1.92, "learning_rate": 3.8853508231661566e-08, "logits/chosen": -1.9479267597198486, "logits/rejected": -1.8955398797988892, "logps/chosen": -30.1317195892334, "logps/rejected": -75.07694244384766, "loss": 0.5935, "rewards/accuracies": 0.0, "rewards/chosen": 3.658698320388794, "rewards/margins": -0.6173684597015381, "rewards/rejected": 4.276066780090332, "step": 8688 }, { "epoch": 1.92, "learning_rate": 3.863082060472978e-08, "logits/chosen": -1.9011521339416504, "logits/rejected": -1.892291784286499, "logps/chosen": -80.18903350830078, "logps/rejected": -47.90956115722656, "loss": 0.281, "rewards/accuracies": 1.0, "rewards/chosen": 4.687924861907959, "rewards/margins": 0.3051724433898926, "rewards/rejected": 4.382752418518066, "step": 8689 }, { "epoch": 1.92, "learning_rate": 3.8408770501810465e-08, "logits/chosen": -1.9268537759780884, "logits/rejected": -1.9268537759780884, "logps/chosen": -39.901893615722656, "logps/rejected": -39.901893615722656, "loss": 0.3913, "rewards/accuracies": 0.0, "rewards/chosen": 6.248088836669922, "rewards/margins": 0.0, "rewards/rejected": 6.248088836669922, "step": 8690 }, { "epoch": 1.92, "learning_rate": 3.818735795143746e-08, "logits/chosen": -1.9044897556304932, "logits/rejected": -1.8512829542160034, "logps/chosen": -62.27515411376953, "logps/rejected": -31.428638458251953, "loss": 0.1465, "rewards/accuracies": 1.0, "rewards/chosen": 5.0619025230407715, "rewards/margins": 1.6091241836547852, "rewards/rejected": 3.4527783393859863, "step": 8691 }, { "epoch": 1.92, "learning_rate": 3.7966582982060796e-08, "logits/chosen": -1.9252725839614868, "logits/rejected": -1.8951303958892822, "logps/chosen": -75.13581848144531, "logps/rejected": -46.29957580566406, "loss": 2.7778, "rewards/accuracies": 0.0, "rewards/chosen": 3.2137176990509033, "rewards/margins": -4.101827621459961, "rewards/rejected": 7.315545082092285, "step": 8692 }, { "epoch": 1.92, "learning_rate": 3.7746445622049435e-08, "logits/chosen": -2.019092082977295, "logits/rejected": -2.0566630363464355, "logps/chosen": -28.036699295043945, "logps/rejected": -63.59574508666992, "loss": 0.892, "rewards/accuracies": 0.0, "rewards/chosen": 1.620618224143982, "rewards/margins": -1.4156469106674194, "rewards/rejected": 3.0362651348114014, "step": 8693 }, { "epoch": 1.92, "learning_rate": 3.75269458996913e-08, "logits/chosen": -2.0488717555999756, "logits/rejected": -2.0449678897857666, "logps/chosen": -34.3717041015625, "logps/rejected": -45.135475158691406, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 3.934277296066284, "rewards/margins": 1.3042182922363281, "rewards/rejected": 2.630059003829956, "step": 8694 }, { "epoch": 1.92, "learning_rate": 3.7308083843191066e-08, "logits/chosen": -1.9987895488739014, "logits/rejected": -1.9273145198822021, "logps/chosen": -82.42252349853516, "logps/rejected": -104.62550354003906, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 7.057872772216797, "rewards/margins": 3.6538078784942627, "rewards/rejected": 3.404064893722534, "step": 8695 }, { "epoch": 1.92, "learning_rate": 3.708985948067234e-08, "logits/chosen": -2.06266450881958, "logits/rejected": -1.9042162895202637, "logps/chosen": -56.394248962402344, "logps/rejected": -159.59083557128906, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 9.91138744354248, "rewards/margins": 3.630187511444092, "rewards/rejected": 6.281199932098389, "step": 8696 }, { "epoch": 1.92, "learning_rate": 3.687227284017547e-08, "logits/chosen": -2.130703926086426, "logits/rejected": -2.133307695388794, "logps/chosen": -33.20460891723633, "logps/rejected": -30.399696350097656, "loss": 0.5134, "rewards/accuracies": 0.0, "rewards/chosen": 3.305729389190674, "rewards/margins": -0.4844837188720703, "rewards/rejected": 3.790213108062744, "step": 8697 }, { "epoch": 1.93, "learning_rate": 3.665532394966087e-08, "logits/chosen": -2.07211971282959, "logits/rejected": -2.072587728500366, "logps/chosen": -65.48455047607422, "logps/rejected": -97.34112548828125, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 9.55971622467041, "rewards/margins": 2.2846598625183105, "rewards/rejected": 7.2750563621521, "step": 8698 }, { "epoch": 1.93, "learning_rate": 3.643901283700568e-08, "logits/chosen": -1.8862245082855225, "logits/rejected": -1.7145280838012695, "logps/chosen": -144.14871215820312, "logps/rejected": -74.27693176269531, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 7.244510173797607, "rewards/margins": 3.707608222961426, "rewards/rejected": 3.5369019508361816, "step": 8699 }, { "epoch": 1.93, "learning_rate": 3.622333953000601e-08, "logits/chosen": -2.1057541370391846, "logits/rejected": -2.1057541370391846, "logps/chosen": -61.04127502441406, "logps/rejected": -61.04127502441406, "loss": 0.3838, "rewards/accuracies": 0.0, "rewards/chosen": 3.9527785778045654, "rewards/margins": 0.0, "rewards/rejected": 3.9527785778045654, "step": 8700 }, { "epoch": 1.93, "learning_rate": 3.600830405637357e-08, "logits/chosen": -1.7561843395233154, "logits/rejected": -1.67127525806427, "logps/chosen": -40.680992126464844, "logps/rejected": -44.208492279052734, "loss": 0.1654, "rewards/accuracies": 1.0, "rewards/chosen": 2.949364423751831, "rewards/margins": 1.063511610031128, "rewards/rejected": 1.8858528137207031, "step": 8701 }, { "epoch": 1.93, "learning_rate": 3.579390644374126e-08, "logits/chosen": -1.8606107234954834, "logits/rejected": -1.87969172000885, "logps/chosen": -40.55310821533203, "logps/rejected": -38.419349670410156, "loss": 0.4781, "rewards/accuracies": 0.0, "rewards/chosen": 3.0300278663635254, "rewards/margins": -0.38070130348205566, "rewards/rejected": 3.410729169845581, "step": 8702 }, { "epoch": 1.93, "learning_rate": 3.558014671965815e-08, "logits/chosen": -2.358752965927124, "logits/rejected": -2.320866584777832, "logps/chosen": -51.02968978881836, "logps/rejected": -20.228961944580078, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": 3.4988980293273926, "rewards/margins": 2.6427953243255615, "rewards/rejected": 0.8561027646064758, "step": 8703 }, { "epoch": 1.93, "learning_rate": 3.536702491159227e-08, "logits/chosen": -2.008408308029175, "logits/rejected": -1.9851900339126587, "logps/chosen": -131.00665283203125, "logps/rejected": -77.33302307128906, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 7.598758220672607, "rewards/margins": 4.448683738708496, "rewards/rejected": 3.1500747203826904, "step": 8704 }, { "epoch": 1.93, "learning_rate": 3.515454104692895e-08, "logits/chosen": -2.3971261978149414, "logits/rejected": -2.4237077236175537, "logps/chosen": -37.324951171875, "logps/rejected": -63.280006408691406, "loss": 0.7317, "rewards/accuracies": 0.0, "rewards/chosen": 4.548077583312988, "rewards/margins": -1.187105655670166, "rewards/rejected": 5.735183238983154, "step": 8705 }, { "epoch": 1.93, "learning_rate": 3.494269515297188e-08, "logits/chosen": -2.1023528575897217, "logits/rejected": -2.0863759517669678, "logps/chosen": -51.522308349609375, "logps/rejected": -190.05245971679688, "loss": 0.3788, "rewards/accuracies": 1.0, "rewards/chosen": 4.836512088775635, "rewards/margins": 0.022135257720947266, "rewards/rejected": 4.8143768310546875, "step": 8706 }, { "epoch": 1.93, "learning_rate": 3.47314872569432e-08, "logits/chosen": -1.9502606391906738, "logits/rejected": -1.9642888307571411, "logps/chosen": -59.225990295410156, "logps/rejected": -105.70280456542969, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 8.482207298278809, "rewards/margins": 2.5260486602783203, "rewards/rejected": 5.956158638000488, "step": 8707 }, { "epoch": 1.93, "learning_rate": 3.4520917385981736e-08, "logits/chosen": -2.039125919342041, "logits/rejected": -2.039125919342041, "logps/chosen": -4.4534502029418945, "logps/rejected": -4.4534502029418945, "loss": 0.3485, "rewards/accuracies": 0.0, "rewards/chosen": 0.8459812998771667, "rewards/margins": 0.0, "rewards/rejected": 0.8459812998771667, "step": 8708 }, { "epoch": 1.93, "learning_rate": 3.4310985567145846e-08, "logits/chosen": -1.7168846130371094, "logits/rejected": -1.6821563243865967, "logps/chosen": -50.010231018066406, "logps/rejected": -41.17161178588867, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": 3.932213544845581, "rewards/margins": 1.5077333450317383, "rewards/rejected": 2.4244801998138428, "step": 8709 }, { "epoch": 1.93, "learning_rate": 3.410169182741174e-08, "logits/chosen": -1.919196367263794, "logits/rejected": -1.8599441051483154, "logps/chosen": -31.502338409423828, "logps/rejected": -13.568317413330078, "loss": 0.1781, "rewards/accuracies": 1.0, "rewards/chosen": 3.248732089996338, "rewards/margins": 2.2192182540893555, "rewards/rejected": 1.029513955116272, "step": 8710 }, { "epoch": 1.93, "learning_rate": 3.389303619367179e-08, "logits/chosen": -1.6635057926177979, "logits/rejected": -1.5424667596817017, "logps/chosen": -58.727813720703125, "logps/rejected": -25.60170555114746, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 6.102736949920654, "rewards/margins": 4.395876407623291, "rewards/rejected": 1.7068605422973633, "step": 8711 }, { "epoch": 1.93, "learning_rate": 3.368501869273899e-08, "logits/chosen": -2.2305634021759033, "logits/rejected": -2.237525463104248, "logps/chosen": -47.755104064941406, "logps/rejected": -22.461122512817383, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 2.1897950172424316, "rewards/margins": 1.7461779117584229, "rewards/rejected": 0.4436170756816864, "step": 8712 }, { "epoch": 1.93, "learning_rate": 3.3477639351342515e-08, "logits/chosen": -1.765029788017273, "logits/rejected": -1.7679802179336548, "logps/chosen": -27.463632583618164, "logps/rejected": -42.63979721069336, "loss": 0.281, "rewards/accuracies": 1.0, "rewards/chosen": 3.366286516189575, "rewards/margins": 0.5030949115753174, "rewards/rejected": 2.863191604614258, "step": 8713 }, { "epoch": 1.93, "learning_rate": 3.3270898196129944e-08, "logits/chosen": -1.5567861795425415, "logits/rejected": -1.5567861795425415, "logps/chosen": -41.58139419555664, "logps/rejected": -41.58139419555664, "loss": 0.3512, "rewards/accuracies": 0.0, "rewards/chosen": 4.122169017791748, "rewards/margins": 0.0, "rewards/rejected": 4.122169017791748, "step": 8714 }, { "epoch": 1.93, "learning_rate": 3.306479525366724e-08, "logits/chosen": -1.905741572380066, "logits/rejected": -1.8547585010528564, "logps/chosen": -57.03709411621094, "logps/rejected": -42.731773376464844, "loss": 0.2449, "rewards/accuracies": 1.0, "rewards/chosen": 2.802462100982666, "rewards/margins": 0.588083028793335, "rewards/rejected": 2.214379072189331, "step": 8715 }, { "epoch": 1.93, "learning_rate": 3.2859330550438216e-08, "logits/chosen": -2.015511989593506, "logits/rejected": -2.034618616104126, "logps/chosen": -83.76216888427734, "logps/rejected": -101.75984191894531, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": 6.241718292236328, "rewards/margins": 2.1698555946350098, "rewards/rejected": 4.071862697601318, "step": 8716 }, { "epoch": 1.93, "learning_rate": 3.2654504112844546e-08, "logits/chosen": -2.083160877227783, "logits/rejected": -2.1112141609191895, "logps/chosen": -36.95237731933594, "logps/rejected": -78.17759704589844, "loss": 1.7662, "rewards/accuracies": 0.0, "rewards/chosen": 2.7575035095214844, "rewards/margins": -3.3357748985290527, "rewards/rejected": 6.093278408050537, "step": 8717 }, { "epoch": 1.93, "learning_rate": 3.2450315967205714e-08, "logits/chosen": -1.8412439823150635, "logits/rejected": -1.819828987121582, "logps/chosen": -45.7763671875, "logps/rejected": -56.20033264160156, "loss": 0.5737, "rewards/accuracies": 0.0, "rewards/chosen": 3.652686357498169, "rewards/margins": -0.6713197231292725, "rewards/rejected": 4.324006080627441, "step": 8718 }, { "epoch": 1.93, "learning_rate": 3.224676613976019e-08, "logits/chosen": -2.393643856048584, "logits/rejected": -2.4172909259796143, "logps/chosen": -84.91233825683594, "logps/rejected": -175.22373962402344, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 11.760027885437012, "rewards/margins": 4.6858229637146, "rewards/rejected": 7.074204921722412, "step": 8719 }, { "epoch": 1.93, "learning_rate": 3.2043854656662595e-08, "logits/chosen": -1.5539159774780273, "logits/rejected": -1.5336520671844482, "logps/chosen": -10.752947807312012, "logps/rejected": -6.286354064941406, "loss": 0.7267, "rewards/accuracies": 0.0, "rewards/chosen": 1.0747287273406982, "rewards/margins": -0.019904613494873047, "rewards/rejected": 1.0946333408355713, "step": 8720 }, { "epoch": 1.93, "learning_rate": 3.1841581543987644e-08, "logits/chosen": -1.9747933149337769, "logits/rejected": -1.937126636505127, "logps/chosen": -83.97212982177734, "logps/rejected": -119.95011138916016, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 7.866547584533691, "rewards/margins": 3.7950758934020996, "rewards/rejected": 4.071471691131592, "step": 8721 }, { "epoch": 1.93, "learning_rate": 3.1639946827725645e-08, "logits/chosen": -1.9141037464141846, "logits/rejected": -1.9141037464141846, "logps/chosen": -41.97087478637695, "logps/rejected": -41.97087478637695, "loss": 0.3917, "rewards/accuracies": 0.0, "rewards/chosen": 7.146711826324463, "rewards/margins": 0.0, "rewards/rejected": 7.146711826324463, "step": 8722 }, { "epoch": 1.93, "learning_rate": 3.143895053378698e-08, "logits/chosen": -2.0558524131774902, "logits/rejected": -2.0459485054016113, "logps/chosen": -47.96355438232422, "logps/rejected": -56.774444580078125, "loss": 0.2436, "rewards/accuracies": 1.0, "rewards/chosen": 4.978271007537842, "rewards/margins": 1.7707743644714355, "rewards/rejected": 3.2074966430664062, "step": 8723 }, { "epoch": 1.93, "learning_rate": 3.1238592687999337e-08, "logits/chosen": -1.7233517169952393, "logits/rejected": -1.7782832384109497, "logps/chosen": -24.349197387695312, "logps/rejected": -36.82497024536133, "loss": 3.198, "rewards/accuracies": 0.0, "rewards/chosen": 2.253925085067749, "rewards/margins": -5.956455230712891, "rewards/rejected": 8.210380554199219, "step": 8724 }, { "epoch": 1.93, "learning_rate": 3.103887331610767e-08, "logits/chosen": -2.296994209289551, "logits/rejected": -2.3131141662597656, "logps/chosen": -59.978397369384766, "logps/rejected": -86.89877319335938, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 3.6106739044189453, "rewards/margins": 0.03275799751281738, "rewards/rejected": 3.577915906906128, "step": 8725 }, { "epoch": 1.93, "learning_rate": 3.0839792443775884e-08, "logits/chosen": -1.7769769430160522, "logits/rejected": -1.834633469581604, "logps/chosen": -54.026405334472656, "logps/rejected": -119.2800064086914, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 8.068540573120117, "rewards/margins": 1.878401279449463, "rewards/rejected": 6.190139293670654, "step": 8726 }, { "epoch": 1.93, "learning_rate": 3.064135009658575e-08, "logits/chosen": -1.7364778518676758, "logits/rejected": -1.7364778518676758, "logps/chosen": -39.41874313354492, "logps/rejected": -39.41874313354492, "loss": 0.3547, "rewards/accuracies": 0.0, "rewards/chosen": 3.984586000442505, "rewards/margins": 0.0, "rewards/rejected": 3.984586000442505, "step": 8727 }, { "epoch": 1.93, "learning_rate": 3.0443546300035764e-08, "logits/chosen": -1.8674482107162476, "logits/rejected": -1.8674482107162476, "logps/chosen": -25.701929092407227, "logps/rejected": -25.701929092407227, "loss": 0.3468, "rewards/accuracies": 0.0, "rewards/chosen": 2.8359854221343994, "rewards/margins": 0.0, "rewards/rejected": 2.8359854221343994, "step": 8728 }, { "epoch": 1.93, "learning_rate": 3.024638107954281e-08, "logits/chosen": -2.0759546756744385, "logits/rejected": -1.9829120635986328, "logps/chosen": -72.00111389160156, "logps/rejected": -57.237998962402344, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": 7.418345928192139, "rewards/margins": 2.2800331115722656, "rewards/rejected": 5.138312816619873, "step": 8729 }, { "epoch": 1.93, "learning_rate": 3.004985446044384e-08, "logits/chosen": -1.8993620872497559, "logits/rejected": -1.8263347148895264, "logps/chosen": -68.0819091796875, "logps/rejected": -98.84635925292969, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": 3.7778306007385254, "rewards/margins": 1.6705918312072754, "rewards/rejected": 2.10723876953125, "step": 8730 }, { "epoch": 1.93, "learning_rate": 2.985396646799088e-08, "logits/chosen": -1.792141318321228, "logits/rejected": -1.78892183303833, "logps/chosen": -51.301578521728516, "logps/rejected": -72.47112274169922, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 5.472087383270264, "rewards/margins": 3.172461986541748, "rewards/rejected": 2.2996253967285156, "step": 8731 }, { "epoch": 1.93, "learning_rate": 2.9658717127355464e-08, "logits/chosen": -1.8459635972976685, "logits/rejected": -1.848718523979187, "logps/chosen": -38.131309509277344, "logps/rejected": -85.27959442138672, "loss": 0.3657, "rewards/accuracies": 1.0, "rewards/chosen": 3.5391273498535156, "rewards/margins": 1.7665764093399048, "rewards/rejected": 1.7725509405136108, "step": 8732 }, { "epoch": 1.93, "learning_rate": 2.9464106463626408e-08, "logits/chosen": -2.484623908996582, "logits/rejected": -2.398139715194702, "logps/chosen": -46.5042724609375, "logps/rejected": -157.5354461669922, "loss": 1.2079, "rewards/accuracies": 0.0, "rewards/chosen": 6.395183563232422, "rewards/margins": -2.2269887924194336, "rewards/rejected": 8.622172355651855, "step": 8733 }, { "epoch": 1.93, "learning_rate": 2.927013450181093e-08, "logits/chosen": -2.0411388874053955, "logits/rejected": -1.998677372932434, "logps/chosen": -27.6470947265625, "logps/rejected": -53.63089370727539, "loss": 0.2227, "rewards/accuracies": 1.0, "rewards/chosen": 4.278026103973389, "rewards/margins": 0.6303174495697021, "rewards/rejected": 3.6477086544036865, "step": 8734 }, { "epoch": 1.93, "learning_rate": 2.9076801266834098e-08, "logits/chosen": -2.244776964187622, "logits/rejected": -2.235755205154419, "logps/chosen": -63.65570831298828, "logps/rejected": -62.10295104980469, "loss": 0.3186, "rewards/accuracies": 1.0, "rewards/chosen": 4.475586891174316, "rewards/margins": 0.14139556884765625, "rewards/rejected": 4.33419132232666, "step": 8735 }, { "epoch": 1.93, "learning_rate": 2.8884106783537703e-08, "logits/chosen": -2.207186222076416, "logits/rejected": -2.2226691246032715, "logps/chosen": -55.63005447387695, "logps/rejected": -59.43870544433594, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": 7.609585762023926, "rewards/margins": 3.5747804641723633, "rewards/rejected": 4.0348052978515625, "step": 8736 }, { "epoch": 1.93, "learning_rate": 2.869205107668416e-08, "logits/chosen": -1.888787031173706, "logits/rejected": -1.5291368961334229, "logps/chosen": -49.85597229003906, "logps/rejected": -230.86546325683594, "loss": 0.3118, "rewards/accuracies": 1.0, "rewards/chosen": 4.983256816864014, "rewards/margins": 0.14666461944580078, "rewards/rejected": 4.836592197418213, "step": 8737 }, { "epoch": 1.93, "learning_rate": 2.8500634170950946e-08, "logits/chosen": -2.2305221557617188, "logits/rejected": -2.221567392349243, "logps/chosen": -34.62061309814453, "logps/rejected": -80.02465057373047, "loss": 0.4252, "rewards/accuracies": 0.0, "rewards/chosen": 6.745176792144775, "rewards/margins": -0.12272262573242188, "rewards/rejected": 6.867899417877197, "step": 8738 }, { "epoch": 1.93, "learning_rate": 2.8309856090935062e-08, "logits/chosen": -2.1842293739318848, "logits/rejected": -2.195919990539551, "logps/chosen": -53.83622741699219, "logps/rejected": -86.18010711669922, "loss": 0.2078, "rewards/accuracies": 1.0, "rewards/chosen": 4.346273899078369, "rewards/margins": 1.0570182800292969, "rewards/rejected": 3.2892556190490723, "step": 8739 }, { "epoch": 1.93, "learning_rate": 2.8119716861151337e-08, "logits/chosen": -2.1815922260284424, "logits/rejected": -2.172205686569214, "logps/chosen": -46.788665771484375, "logps/rejected": -62.57607650756836, "loss": 0.3367, "rewards/accuracies": 1.0, "rewards/chosen": 6.687997341156006, "rewards/margins": 0.0657501220703125, "rewards/rejected": 6.622247219085693, "step": 8740 }, { "epoch": 1.93, "learning_rate": 2.7930216506031894e-08, "logits/chosen": -1.9018285274505615, "logits/rejected": -1.9262834787368774, "logps/chosen": -44.95064926147461, "logps/rejected": -51.87816619873047, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 2.854663610458374, "rewards/margins": 0.15784645080566406, "rewards/rejected": 2.69681715965271, "step": 8741 }, { "epoch": 1.93, "learning_rate": 2.77413550499267e-08, "logits/chosen": -1.9959889650344849, "logits/rejected": -1.9741164445877075, "logps/chosen": -41.36866760253906, "logps/rejected": -38.81822967529297, "loss": 0.4321, "rewards/accuracies": 1.0, "rewards/chosen": 3.638617753982544, "rewards/margins": 0.9693458080291748, "rewards/rejected": 2.669271945953369, "step": 8742 }, { "epoch": 1.94, "learning_rate": 2.7553132517104675e-08, "logits/chosen": -2.150355815887451, "logits/rejected": -2.0359303951263428, "logps/chosen": -91.59241485595703, "logps/rejected": -24.524444580078125, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 6.0166850090026855, "rewards/margins": 5.800248622894287, "rewards/rejected": 0.21643619239330292, "step": 8743 }, { "epoch": 1.94, "learning_rate": 2.7365548931752028e-08, "logits/chosen": -2.0278165340423584, "logits/rejected": -1.994942545890808, "logps/chosen": -42.95927429199219, "logps/rejected": -55.50852584838867, "loss": 0.1628, "rewards/accuracies": 1.0, "rewards/chosen": 4.756181240081787, "rewards/margins": 1.1204233169555664, "rewards/rejected": 3.6357579231262207, "step": 8744 }, { "epoch": 1.94, "learning_rate": 2.7178604317971702e-08, "logits/chosen": -1.9381948709487915, "logits/rejected": -1.8948769569396973, "logps/chosen": -45.75057601928711, "logps/rejected": -59.91901397705078, "loss": 0.4979, "rewards/accuracies": 0.0, "rewards/chosen": 4.834801197052002, "rewards/margins": -0.4383854866027832, "rewards/rejected": 5.273186683654785, "step": 8745 }, { "epoch": 1.94, "learning_rate": 2.6992298699787254e-08, "logits/chosen": -2.2500617504119873, "logits/rejected": -2.2852578163146973, "logps/chosen": -123.19576263427734, "logps/rejected": -65.26722717285156, "loss": 0.1336, "rewards/accuracies": 1.0, "rewards/chosen": 7.1484904289245605, "rewards/margins": 1.6641831398010254, "rewards/rejected": 5.484307289123535, "step": 8746 }, { "epoch": 1.94, "learning_rate": 2.6806632101136764e-08, "logits/chosen": -1.6802382469177246, "logits/rejected": -1.6448760032653809, "logps/chosen": -99.34355163574219, "logps/rejected": -87.2304916381836, "loss": 0.2912, "rewards/accuracies": 1.0, "rewards/chosen": 6.562413215637207, "rewards/margins": 0.26429080963134766, "rewards/rejected": 6.298122406005859, "step": 8747 }, { "epoch": 1.94, "learning_rate": 2.6621604545879476e-08, "logits/chosen": -1.7750478982925415, "logits/rejected": -1.7750478982925415, "logps/chosen": -34.60841751098633, "logps/rejected": -34.60841751098633, "loss": 0.3479, "rewards/accuracies": 0.0, "rewards/chosen": 5.900378227233887, "rewards/margins": 0.0, "rewards/rejected": 5.900378227233887, "step": 8748 }, { "epoch": 1.94, "learning_rate": 2.6437216057790262e-08, "logits/chosen": -2.0933752059936523, "logits/rejected": -2.0034570693969727, "logps/chosen": -137.83522033691406, "logps/rejected": -103.62245178222656, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 7.181227207183838, "rewards/margins": 5.244523525238037, "rewards/rejected": 1.9367035627365112, "step": 8749 }, { "epoch": 1.94, "learning_rate": 2.625346666056239e-08, "logits/chosen": -1.9836031198501587, "logits/rejected": -1.8923419713974, "logps/chosen": -40.82681655883789, "logps/rejected": -11.554752349853516, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 4.6540327072143555, "rewards/margins": 3.6969492435455322, "rewards/rejected": 0.957083523273468, "step": 8750 }, { "epoch": 1.94, "learning_rate": 2.607035637780808e-08, "logits/chosen": -1.8536831140518188, "logits/rejected": -1.7848831415176392, "logps/chosen": -49.75138854980469, "logps/rejected": -62.48556137084961, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 5.478291988372803, "rewards/margins": 3.425140619277954, "rewards/rejected": 2.0531513690948486, "step": 8751 }, { "epoch": 1.94, "learning_rate": 2.588788523305519e-08, "logits/chosen": -1.8389124870300293, "logits/rejected": -1.8653901815414429, "logps/chosen": -79.11113739013672, "logps/rejected": -54.91122055053711, "loss": 0.273, "rewards/accuracies": 1.0, "rewards/chosen": 7.109867095947266, "rewards/margins": 2.5225281715393066, "rewards/rejected": 4.587338924407959, "step": 8752 }, { "epoch": 1.94, "learning_rate": 2.5706053249752173e-08, "logits/chosen": -1.834191083908081, "logits/rejected": -1.8617936372756958, "logps/chosen": -132.71812438964844, "logps/rejected": -95.70904541015625, "loss": 0.2475, "rewards/accuracies": 1.0, "rewards/chosen": 6.187905788421631, "rewards/margins": 0.48562145233154297, "rewards/rejected": 5.702284336090088, "step": 8753 }, { "epoch": 1.94, "learning_rate": 2.5524860451263123e-08, "logits/chosen": -1.7277847528457642, "logits/rejected": -1.6884127855300903, "logps/chosen": -32.32699966430664, "logps/rejected": -25.000383377075195, "loss": 0.2699, "rewards/accuracies": 1.0, "rewards/chosen": 2.933020830154419, "rewards/margins": 0.3348402976989746, "rewards/rejected": 2.5981805324554443, "step": 8754 }, { "epoch": 1.94, "learning_rate": 2.5344306860871636e-08, "logits/chosen": -2.104865074157715, "logits/rejected": -2.0269691944122314, "logps/chosen": -58.99448776245117, "logps/rejected": -70.15924072265625, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": 5.530272960662842, "rewards/margins": 4.442872047424316, "rewards/rejected": 1.0874007940292358, "step": 8755 }, { "epoch": 1.94, "learning_rate": 2.516439250177749e-08, "logits/chosen": -2.134666919708252, "logits/rejected": -2.131357192993164, "logps/chosen": -106.69181823730469, "logps/rejected": -63.38880920410156, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": 4.9674391746521, "rewards/margins": 1.76137375831604, "rewards/rejected": 3.2060654163360596, "step": 8756 }, { "epoch": 1.94, "learning_rate": 2.4985117397099967e-08, "logits/chosen": -1.8984706401824951, "logits/rejected": -1.900588870048523, "logps/chosen": -55.60212707519531, "logps/rejected": -44.570045471191406, "loss": 0.2974, "rewards/accuracies": 1.0, "rewards/chosen": 2.58332896232605, "rewards/margins": 0.21555161476135254, "rewards/rejected": 2.3677773475646973, "step": 8757 }, { "epoch": 1.94, "learning_rate": 2.4806481569875087e-08, "logits/chosen": -1.7813149690628052, "logits/rejected": -1.6376882791519165, "logps/chosen": -82.70622253417969, "logps/rejected": -14.661308288574219, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 4.322256565093994, "rewards/margins": 3.338165044784546, "rewards/rejected": 0.984091579914093, "step": 8758 }, { "epoch": 1.94, "learning_rate": 2.462848504305726e-08, "logits/chosen": -1.8125343322753906, "logits/rejected": -1.823691964149475, "logps/chosen": -45.17194366455078, "logps/rejected": -66.94074249267578, "loss": 0.2121, "rewards/accuracies": 1.0, "rewards/chosen": 5.559516906738281, "rewards/margins": 0.6500701904296875, "rewards/rejected": 4.909446716308594, "step": 8759 }, { "epoch": 1.94, "learning_rate": 2.4451127839518752e-08, "logits/chosen": -2.0523383617401123, "logits/rejected": -1.9357469081878662, "logps/chosen": -42.991111755371094, "logps/rejected": -14.883628845214844, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 4.639803409576416, "rewards/margins": 4.211650848388672, "rewards/rejected": 0.428152471780777, "step": 8760 }, { "epoch": 1.94, "learning_rate": 2.4274409982049662e-08, "logits/chosen": -1.853519082069397, "logits/rejected": -1.8971318006515503, "logps/chosen": -93.64906311035156, "logps/rejected": -126.91219329833984, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 9.270617485046387, "rewards/margins": 2.848817825317383, "rewards/rejected": 6.421799659729004, "step": 8761 }, { "epoch": 1.94, "learning_rate": 2.4098331493357386e-08, "logits/chosen": -1.7353016138076782, "logits/rejected": -1.7205623388290405, "logps/chosen": -24.620738983154297, "logps/rejected": -61.14469909667969, "loss": 1.709, "rewards/accuracies": 0.0, "rewards/chosen": 2.1770999431610107, "rewards/margins": -0.3526477813720703, "rewards/rejected": 2.529747724533081, "step": 8762 }, { "epoch": 1.94, "learning_rate": 2.392289239606771e-08, "logits/chosen": -1.7307853698730469, "logits/rejected": -1.6137545108795166, "logps/chosen": -48.631614685058594, "logps/rejected": -22.114612579345703, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 3.942974090576172, "rewards/margins": 0.5978484153747559, "rewards/rejected": 3.345125675201416, "step": 8763 }, { "epoch": 1.94, "learning_rate": 2.3748092712724268e-08, "logits/chosen": -1.9044852256774902, "logits/rejected": -1.8629603385925293, "logps/chosen": -133.47125244140625, "logps/rejected": -138.91055297851562, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 9.489300727844238, "rewards/margins": 2.9607272148132324, "rewards/rejected": 6.528573513031006, "step": 8764 }, { "epoch": 1.94, "learning_rate": 2.3573932465787985e-08, "logits/chosen": -1.9666588306427002, "logits/rejected": -1.9318352937698364, "logps/chosen": -41.15669250488281, "logps/rejected": -23.095840454101562, "loss": 0.3038, "rewards/accuracies": 1.0, "rewards/chosen": 4.105854034423828, "rewards/margins": 0.3891575336456299, "rewards/rejected": 3.7166965007781982, "step": 8765 }, { "epoch": 1.94, "learning_rate": 2.340041167763929e-08, "logits/chosen": -2.0216808319091797, "logits/rejected": -2.019040822982788, "logps/chosen": -89.39702606201172, "logps/rejected": -83.21328735351562, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 7.671536922454834, "rewards/margins": 4.097929954528809, "rewards/rejected": 3.5736069679260254, "step": 8766 }, { "epoch": 1.94, "learning_rate": 2.3227530370573682e-08, "logits/chosen": -1.86858332157135, "logits/rejected": -1.929314374923706, "logps/chosen": -41.23252868652344, "logps/rejected": -128.92529296875, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": 4.8665947914123535, "rewards/margins": 0.9803245067596436, "rewards/rejected": 3.88627028465271, "step": 8767 }, { "epoch": 1.94, "learning_rate": 2.3055288566806168e-08, "logits/chosen": -2.142017364501953, "logits/rejected": -2.0138540267944336, "logps/chosen": -55.26795959472656, "logps/rejected": -69.67919158935547, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 4.5547194480896, "rewards/margins": 3.1649670600891113, "rewards/rejected": 1.3897522687911987, "step": 8768 }, { "epoch": 1.94, "learning_rate": 2.2883686288470154e-08, "logits/chosen": -2.091383218765259, "logits/rejected": -2.0815773010253906, "logps/chosen": -53.077545166015625, "logps/rejected": -83.16220092773438, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 2.9301559925079346, "rewards/margins": 1.4316147565841675, "rewards/rejected": 1.498541235923767, "step": 8769 }, { "epoch": 1.94, "learning_rate": 2.2712723557616335e-08, "logits/chosen": -1.8461577892303467, "logits/rejected": -1.7066758871078491, "logps/chosen": -58.091156005859375, "logps/rejected": -15.110564231872559, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 3.4991614818573, "rewards/margins": 3.2272682189941406, "rewards/rejected": 0.2718932330608368, "step": 8770 }, { "epoch": 1.94, "learning_rate": 2.2542400396211582e-08, "logits/chosen": -2.079097270965576, "logits/rejected": -1.847212553024292, "logps/chosen": -89.39281463623047, "logps/rejected": -26.074493408203125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 8.820972442626953, "rewards/margins": 7.48983097076416, "rewards/rejected": 1.3311413526535034, "step": 8771 }, { "epoch": 1.94, "learning_rate": 2.237271682614339e-08, "logits/chosen": -1.8788750171661377, "logits/rejected": -1.222002625465393, "logps/chosen": -30.566593170166016, "logps/rejected": -67.439453125, "loss": 0.7972, "rewards/accuracies": 0.0, "rewards/chosen": 3.435673236846924, "rewards/margins": -1.2232065200805664, "rewards/rejected": 4.65887975692749, "step": 8772 }, { "epoch": 1.94, "learning_rate": 2.2203672869215433e-08, "logits/chosen": -1.9092986583709717, "logits/rejected": -1.9092986583709717, "logps/chosen": -53.48215866088867, "logps/rejected": -53.48215866088867, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 8.140932083129883, "rewards/margins": 0.0, "rewards/rejected": 8.140932083129883, "step": 8773 }, { "epoch": 1.94, "learning_rate": 2.203526854714866e-08, "logits/chosen": -2.091918706893921, "logits/rejected": -2.0496139526367188, "logps/chosen": -44.032508850097656, "logps/rejected": -102.7539291381836, "loss": 0.1816, "rewards/accuracies": 1.0, "rewards/chosen": 4.027210235595703, "rewards/margins": 1.0148422718048096, "rewards/rejected": 3.0123679637908936, "step": 8774 }, { "epoch": 1.94, "learning_rate": 2.1867503881584094e-08, "logits/chosen": -1.6254621744155884, "logits/rejected": -1.511687159538269, "logps/chosen": -29.91689682006836, "logps/rejected": -11.578081130981445, "loss": 0.1131, "rewards/accuracies": 1.0, "rewards/chosen": 1.9970375299453735, "rewards/margins": 1.7639122009277344, "rewards/rejected": 0.23312531411647797, "step": 8775 }, { "epoch": 1.94, "learning_rate": 2.170037889407728e-08, "logits/chosen": -1.7633483409881592, "logits/rejected": -1.7479180097579956, "logps/chosen": -87.4371109008789, "logps/rejected": -88.42424011230469, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": 4.817695140838623, "rewards/margins": 1.2375192642211914, "rewards/rejected": 3.5801758766174316, "step": 8776 }, { "epoch": 1.94, "learning_rate": 2.153389360610436e-08, "logits/chosen": -1.4407190084457397, "logits/rejected": -1.4614570140838623, "logps/chosen": -5.950037956237793, "logps/rejected": -6.941223621368408, "loss": 0.6051, "rewards/accuracies": 0.0, "rewards/chosen": 0.781104564666748, "rewards/margins": -0.6351855993270874, "rewards/rejected": 1.4162901639938354, "step": 8777 }, { "epoch": 1.94, "learning_rate": 2.1368048039058786e-08, "logits/chosen": -2.047743082046509, "logits/rejected": -1.9738266468048096, "logps/chosen": -116.74776458740234, "logps/rejected": -67.93909454345703, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 7.771668434143066, "rewards/margins": 3.675139904022217, "rewards/rejected": 4.09652853012085, "step": 8778 }, { "epoch": 1.94, "learning_rate": 2.1202842214250175e-08, "logits/chosen": -1.7668830156326294, "logits/rejected": -1.8387832641601562, "logps/chosen": -45.49382781982422, "logps/rejected": -172.69882202148438, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 7.098970890045166, "rewards/margins": 2.5394277572631836, "rewards/rejected": 4.559543132781982, "step": 8779 }, { "epoch": 1.94, "learning_rate": 2.1038276152907656e-08, "logits/chosen": -2.034243583679199, "logits/rejected": -1.987727165222168, "logps/chosen": -73.70110321044922, "logps/rejected": -82.29472351074219, "loss": 1.1774, "rewards/accuracies": 1.0, "rewards/chosen": 5.8121867179870605, "rewards/margins": 2.690084218978882, "rewards/rejected": 3.1221024990081787, "step": 8780 }, { "epoch": 1.94, "learning_rate": 2.0874349876177648e-08, "logits/chosen": -1.9465893507003784, "logits/rejected": -1.8797703981399536, "logps/chosen": -65.02266693115234, "logps/rejected": -17.07813262939453, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 4.581809997558594, "rewards/margins": 4.085559368133545, "rewards/rejected": 0.496250718832016, "step": 8781 }, { "epoch": 1.94, "learning_rate": 2.0711063405124966e-08, "logits/chosen": -2.056631326675415, "logits/rejected": -2.07075572013855, "logps/chosen": -31.947383880615234, "logps/rejected": -76.99644470214844, "loss": 0.6147, "rewards/accuracies": 0.0, "rewards/chosen": 3.6223857402801514, "rewards/margins": -0.80721116065979, "rewards/rejected": 4.429596900939941, "step": 8782 }, { "epoch": 1.94, "learning_rate": 2.0548416760729495e-08, "logits/chosen": -1.749539852142334, "logits/rejected": -1.257716417312622, "logps/chosen": -37.63475799560547, "logps/rejected": -109.0574951171875, "loss": 0.8247, "rewards/accuracies": 0.0, "rewards/chosen": 3.004164934158325, "rewards/margins": -1.4210245609283447, "rewards/rejected": 4.42518949508667, "step": 8783 }, { "epoch": 1.94, "learning_rate": 2.038640996389285e-08, "logits/chosen": -2.1370086669921875, "logits/rejected": -2.1425297260284424, "logps/chosen": -35.11564636230469, "logps/rejected": -79.91921997070312, "loss": 0.711, "rewards/accuracies": 0.0, "rewards/chosen": 3.20564341545105, "rewards/margins": -1.1154029369354248, "rewards/rejected": 4.321046352386475, "step": 8784 }, { "epoch": 1.94, "learning_rate": 2.0225043035431714e-08, "logits/chosen": -1.9061020612716675, "logits/rejected": -1.8876465559005737, "logps/chosen": -34.030330657958984, "logps/rejected": -37.45127868652344, "loss": 2.1866, "rewards/accuracies": 0.0, "rewards/chosen": 2.433964252471924, "rewards/margins": -0.3191044330596924, "rewards/rejected": 2.753068685531616, "step": 8785 }, { "epoch": 1.94, "learning_rate": 2.0064315996081717e-08, "logits/chosen": -1.8236353397369385, "logits/rejected": -1.8070908784866333, "logps/chosen": -25.666183471679688, "logps/rejected": -24.679462432861328, "loss": 0.209, "rewards/accuracies": 1.0, "rewards/chosen": 3.0119049549102783, "rewards/margins": 1.0837879180908203, "rewards/rejected": 1.928117036819458, "step": 8786 }, { "epoch": 1.94, "learning_rate": 1.990422886649579e-08, "logits/chosen": -1.8796452283859253, "logits/rejected": -1.8468159437179565, "logps/chosen": -29.091373443603516, "logps/rejected": -55.211631774902344, "loss": 0.2705, "rewards/accuracies": 1.0, "rewards/chosen": 4.065433025360107, "rewards/margins": 0.5300493240356445, "rewards/rejected": 3.535383701324463, "step": 8787 }, { "epoch": 1.95, "learning_rate": 1.9744781667244138e-08, "logits/chosen": -1.8587316274642944, "logits/rejected": -1.8982304334640503, "logps/chosen": -18.17667579650879, "logps/rejected": -37.31113052368164, "loss": 0.833, "rewards/accuracies": 0.0, "rewards/chosen": 1.8570642471313477, "rewards/margins": -1.2191712856292725, "rewards/rejected": 3.07623553276062, "step": 8788 }, { "epoch": 1.95, "learning_rate": 1.9585974418815933e-08, "logits/chosen": -2.0355026721954346, "logits/rejected": -2.023898124694824, "logps/chosen": -91.67292785644531, "logps/rejected": -113.16679382324219, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 7.510400295257568, "rewards/margins": 2.0246992111206055, "rewards/rejected": 5.485701084136963, "step": 8789 }, { "epoch": 1.95, "learning_rate": 1.9427807141617627e-08, "logits/chosen": -1.731658697128296, "logits/rejected": -1.731658697128296, "logps/chosen": -35.46916580200195, "logps/rejected": -35.46916580200195, "loss": 0.3621, "rewards/accuracies": 0.0, "rewards/chosen": 2.891223669052124, "rewards/margins": 0.0, "rewards/rejected": 2.891223669052124, "step": 8790 }, { "epoch": 1.95, "learning_rate": 1.927027985597352e-08, "logits/chosen": -1.983858346939087, "logits/rejected": -2.0329935550689697, "logps/chosen": -35.74123001098633, "logps/rejected": -105.44163513183594, "loss": 0.7326, "rewards/accuracies": 0.0, "rewards/chosen": 5.3528265953063965, "rewards/margins": -1.186098575592041, "rewards/rejected": 6.5389251708984375, "step": 8791 }, { "epoch": 1.95, "learning_rate": 1.9113392582124633e-08, "logits/chosen": -2.363983631134033, "logits/rejected": -2.3510265350341797, "logps/chosen": -55.693023681640625, "logps/rejected": -55.281219482421875, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 4.854709148406982, "rewards/margins": 1.745227336883545, "rewards/rejected": 3.1094818115234375, "step": 8792 }, { "epoch": 1.95, "learning_rate": 1.8957145340230965e-08, "logits/chosen": -1.934260606765747, "logits/rejected": -1.8813053369522095, "logps/chosen": -102.60598754882812, "logps/rejected": -53.8117790222168, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 7.0308427810668945, "rewards/margins": 3.597590446472168, "rewards/rejected": 3.4332523345947266, "step": 8793 }, { "epoch": 1.95, "learning_rate": 1.8801538150370335e-08, "logits/chosen": -1.8929919004440308, "logits/rejected": -1.8111830949783325, "logps/chosen": -49.601234436035156, "logps/rejected": -22.621679306030273, "loss": 0.8946, "rewards/accuracies": 1.0, "rewards/chosen": 2.691295623779297, "rewards/margins": 1.3120763301849365, "rewards/rejected": 1.3792192935943604, "step": 8794 }, { "epoch": 1.95, "learning_rate": 1.864657103253731e-08, "logits/chosen": -1.6745514869689941, "logits/rejected": -1.6544960737228394, "logps/chosen": -47.2820930480957, "logps/rejected": -61.622676849365234, "loss": 0.5991, "rewards/accuracies": 0.0, "rewards/chosen": 4.485504627227783, "rewards/margins": -0.7295999526977539, "rewards/rejected": 5.215104579925537, "step": 8795 }, { "epoch": 1.95, "learning_rate": 1.84922440066454e-08, "logits/chosen": -1.7896044254302979, "logits/rejected": -1.8781132698059082, "logps/chosen": -36.81560516357422, "logps/rejected": -155.10284423828125, "loss": 0.2876, "rewards/accuracies": 1.0, "rewards/chosen": 5.445176601409912, "rewards/margins": 0.8241438865661621, "rewards/rejected": 4.62103271484375, "step": 8796 }, { "epoch": 1.95, "learning_rate": 1.83385570925243e-08, "logits/chosen": -2.2688238620758057, "logits/rejected": -2.1889679431915283, "logps/chosen": -52.26200485229492, "logps/rejected": -14.592620849609375, "loss": 1.0882, "rewards/accuracies": 1.0, "rewards/chosen": 4.135038375854492, "rewards/margins": 3.773458957672119, "rewards/rejected": 0.3615795075893402, "step": 8797 }, { "epoch": 1.95, "learning_rate": 1.818551030992377e-08, "logits/chosen": -1.9703367948532104, "logits/rejected": -1.8857327699661255, "logps/chosen": -33.181129455566406, "logps/rejected": -80.23677825927734, "loss": 1.483, "rewards/accuracies": 0.0, "rewards/chosen": 3.5198800563812256, "rewards/margins": -2.857696294784546, "rewards/rejected": 6.3775763511657715, "step": 8798 }, { "epoch": 1.95, "learning_rate": 1.8033103678508635e-08, "logits/chosen": -1.885452389717102, "logits/rejected": -1.7517695426940918, "logps/chosen": -43.93991470336914, "logps/rejected": -10.17068862915039, "loss": 0.0806, "rewards/accuracies": 1.0, "rewards/chosen": 3.9684536457061768, "rewards/margins": 3.0510201454162598, "rewards/rejected": 0.9174333810806274, "step": 8799 }, { "epoch": 1.95, "learning_rate": 1.788133721786378e-08, "logits/chosen": -2.087378740310669, "logits/rejected": -2.0932908058166504, "logps/chosen": -81.69827270507812, "logps/rejected": -81.52699279785156, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 10.284454345703125, "rewards/margins": 4.423998832702637, "rewards/rejected": 5.860455513000488, "step": 8800 }, { "epoch": 1.95, "learning_rate": 1.773021094749028e-08, "logits/chosen": -1.864084243774414, "logits/rejected": -1.864084243774414, "logps/chosen": -28.53633689880371, "logps/rejected": -28.53633689880371, "loss": 0.3549, "rewards/accuracies": 0.0, "rewards/chosen": 4.9521050453186035, "rewards/margins": 0.0, "rewards/rejected": 4.9521050453186035, "step": 8801 }, { "epoch": 1.95, "learning_rate": 1.7579724886807593e-08, "logits/chosen": -2.207303047180176, "logits/rejected": -2.2222092151641846, "logps/chosen": -95.83869171142578, "logps/rejected": -64.03582763671875, "loss": 0.1861, "rewards/accuracies": 1.0, "rewards/chosen": 9.962542533874512, "rewards/margins": 1.7829313278198242, "rewards/rejected": 8.179611206054688, "step": 8802 }, { "epoch": 1.95, "learning_rate": 1.742987905515303e-08, "logits/chosen": -1.8699487447738647, "logits/rejected": -1.8822635412216187, "logps/chosen": -16.337167739868164, "logps/rejected": -53.090633392333984, "loss": 0.6029, "rewards/accuracies": 0.0, "rewards/chosen": 2.5608975887298584, "rewards/margins": -0.8000349998474121, "rewards/rejected": 3.3609325885772705, "step": 8803 }, { "epoch": 1.95, "learning_rate": 1.7280673471781194e-08, "logits/chosen": -1.822144865989685, "logits/rejected": -1.8336364030838013, "logps/chosen": -17.046524047851562, "logps/rejected": -116.89834594726562, "loss": 0.2458, "rewards/accuracies": 1.0, "rewards/chosen": 3.754930257797241, "rewards/margins": 0.5059616565704346, "rewards/rejected": 3.2489686012268066, "step": 8804 }, { "epoch": 1.95, "learning_rate": 1.7132108155864523e-08, "logits/chosen": -1.6394342184066772, "logits/rejected": -1.6394342184066772, "logps/chosen": -21.138158798217773, "logps/rejected": -21.138158798217773, "loss": 0.412, "rewards/accuracies": 0.0, "rewards/chosen": 3.3558075428009033, "rewards/margins": 0.0, "rewards/rejected": 3.3558075428009033, "step": 8805 }, { "epoch": 1.95, "learning_rate": 1.6984183126493304e-08, "logits/chosen": -1.772937297821045, "logits/rejected": -1.7144649028778076, "logps/chosen": -43.994537353515625, "logps/rejected": -44.738800048828125, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 2.6190781593322754, "rewards/margins": 2.030613899230957, "rewards/rejected": 0.5884643793106079, "step": 8806 }, { "epoch": 1.95, "learning_rate": 1.6836898402675662e-08, "logits/chosen": -2.110832929611206, "logits/rejected": -2.0797011852264404, "logps/chosen": -72.09540557861328, "logps/rejected": -69.20874786376953, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 4.533670902252197, "rewards/margins": 2.3182342052459717, "rewards/rejected": 2.2154366970062256, "step": 8807 }, { "epoch": 1.95, "learning_rate": 1.669025400333757e-08, "logits/chosen": -1.8153376579284668, "logits/rejected": -1.8109625577926636, "logps/chosen": -50.695762634277344, "logps/rejected": -62.821685791015625, "loss": 0.416, "rewards/accuracies": 0.0, "rewards/chosen": 2.724297285079956, "rewards/margins": -0.2592630386352539, "rewards/rejected": 2.98356032371521, "step": 8808 }, { "epoch": 1.95, "learning_rate": 1.6544249947322845e-08, "logits/chosen": -2.0621514320373535, "logits/rejected": -1.9373542070388794, "logps/chosen": -137.4908905029297, "logps/rejected": -53.825477600097656, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 9.18553352355957, "rewards/margins": 3.818542957305908, "rewards/rejected": 5.366990566253662, "step": 8809 }, { "epoch": 1.95, "learning_rate": 1.6398886253391478e-08, "logits/chosen": -2.1260905265808105, "logits/rejected": -2.135441541671753, "logps/chosen": -41.25997543334961, "logps/rejected": -43.89389419555664, "loss": 0.4374, "rewards/accuracies": 0.0, "rewards/chosen": 3.2475240230560303, "rewards/margins": -0.3002784252166748, "rewards/rejected": 3.547802448272705, "step": 8810 }, { "epoch": 1.95, "learning_rate": 1.6254162940222973e-08, "logits/chosen": -1.817035436630249, "logits/rejected": -1.884290099143982, "logps/chosen": -45.0547981262207, "logps/rejected": -94.41153717041016, "loss": 0.318, "rewards/accuracies": 1.0, "rewards/chosen": 4.290614128112793, "rewards/margins": 0.36244702339172363, "rewards/rejected": 3.9281671047210693, "step": 8811 }, { "epoch": 1.95, "learning_rate": 1.6110080026414123e-08, "logits/chosen": -1.7859034538269043, "logits/rejected": -1.6694374084472656, "logps/chosen": -86.93798828125, "logps/rejected": -80.51654052734375, "loss": 0.2091, "rewards/accuracies": 1.0, "rewards/chosen": 5.4253997802734375, "rewards/margins": 0.9644408226013184, "rewards/rejected": 4.460958957672119, "step": 8812 }, { "epoch": 1.95, "learning_rate": 1.5966637530479e-08, "logits/chosen": -1.9149399995803833, "logits/rejected": -1.8246031999588013, "logps/chosen": -66.61312103271484, "logps/rejected": -54.114295959472656, "loss": 0.072, "rewards/accuracies": 1.0, "rewards/chosen": 3.7005670070648193, "rewards/margins": 2.4590678215026855, "rewards/rejected": 1.2414993047714233, "step": 8813 }, { "epoch": 1.95, "learning_rate": 1.5823835470849535e-08, "logits/chosen": -2.0209178924560547, "logits/rejected": -2.048258066177368, "logps/chosen": -26.843326568603516, "logps/rejected": -121.80686950683594, "loss": 0.5416, "rewards/accuracies": 1.0, "rewards/chosen": 4.2692036628723145, "rewards/margins": 3.054727554321289, "rewards/rejected": 1.2144759893417358, "step": 8814 }, { "epoch": 1.95, "learning_rate": 1.5681673865876045e-08, "logits/chosen": -2.1435599327087402, "logits/rejected": -2.051060676574707, "logps/chosen": -118.13094329833984, "logps/rejected": -81.4931411743164, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 6.231598854064941, "rewards/margins": 5.36384391784668, "rewards/rejected": 0.867755115032196, "step": 8815 }, { "epoch": 1.95, "learning_rate": 1.5540152733825033e-08, "logits/chosen": -1.9137743711471558, "logits/rejected": -1.7965224981307983, "logps/chosen": -107.9748764038086, "logps/rejected": -32.34901809692383, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 5.549392223358154, "rewards/margins": 5.282168388366699, "rewards/rejected": 0.2672237455844879, "step": 8816 }, { "epoch": 1.95, "learning_rate": 1.53992720928825e-08, "logits/chosen": -1.7742468118667603, "logits/rejected": -1.4602919816970825, "logps/chosen": -83.99137878417969, "logps/rejected": -70.60594940185547, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": 4.736654758453369, "rewards/margins": 1.326289415359497, "rewards/rejected": 3.410365343093872, "step": 8817 }, { "epoch": 1.95, "learning_rate": 1.525903196115064e-08, "logits/chosen": -1.854525089263916, "logits/rejected": -1.815540075302124, "logps/chosen": -29.95168685913086, "logps/rejected": -53.36975860595703, "loss": 0.2669, "rewards/accuracies": 1.0, "rewards/chosen": 3.7149510383605957, "rewards/margins": 0.42276501655578613, "rewards/rejected": 3.2921860218048096, "step": 8818 }, { "epoch": 1.95, "learning_rate": 1.5119432356650032e-08, "logits/chosen": -2.098994255065918, "logits/rejected": -2.0973198413848877, "logps/chosen": -58.839786529541016, "logps/rejected": -37.95332717895508, "loss": 0.1548, "rewards/accuracies": 1.0, "rewards/chosen": 3.8905436992645264, "rewards/margins": 1.064134120941162, "rewards/rejected": 2.8264095783233643, "step": 8819 }, { "epoch": 1.95, "learning_rate": 1.498047329731911e-08, "logits/chosen": -1.816255807876587, "logits/rejected": -1.828688383102417, "logps/chosen": -26.11684799194336, "logps/rejected": -73.98147583007812, "loss": 0.5151, "rewards/accuracies": 1.0, "rewards/chosen": 4.621077537536621, "rewards/margins": 0.7631733417510986, "rewards/rejected": 3.8579041957855225, "step": 8820 }, { "epoch": 1.95, "learning_rate": 1.4842154801013587e-08, "logits/chosen": -1.8106138706207275, "logits/rejected": -1.8234007358551025, "logps/chosen": -80.02876281738281, "logps/rejected": -136.92112731933594, "loss": 2.2175, "rewards/accuracies": 0.0, "rewards/chosen": 9.70507526397705, "rewards/margins": -4.417985916137695, "rewards/rejected": 14.123061180114746, "step": 8821 }, { "epoch": 1.95, "learning_rate": 1.470447688550758e-08, "logits/chosen": -1.6501930952072144, "logits/rejected": -1.656885027885437, "logps/chosen": -24.234161376953125, "logps/rejected": -35.615360260009766, "loss": 0.3271, "rewards/accuracies": 1.0, "rewards/chosen": 2.4666335582733154, "rewards/margins": 0.1439197063446045, "rewards/rejected": 2.322713851928711, "step": 8822 }, { "epoch": 1.95, "learning_rate": 1.4567439568491382e-08, "logits/chosen": -1.938785433769226, "logits/rejected": -1.938785433769226, "logps/chosen": -30.171627044677734, "logps/rejected": -30.171627044677734, "loss": 0.839, "rewards/accuracies": 0.0, "rewards/chosen": 1.8768036365509033, "rewards/margins": 0.0, "rewards/rejected": 1.8768036365509033, "step": 8823 }, { "epoch": 1.95, "learning_rate": 1.4431042867575351e-08, "logits/chosen": -1.738818645477295, "logits/rejected": -1.7074558734893799, "logps/chosen": -75.33187866210938, "logps/rejected": -82.71237182617188, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 3.969163656234741, "rewards/margins": 1.4930665493011475, "rewards/rejected": 2.4760971069335938, "step": 8824 }, { "epoch": 1.95, "learning_rate": 1.4295286800284913e-08, "logits/chosen": -1.8331416845321655, "logits/rejected": -1.8527048826217651, "logps/chosen": -55.14634704589844, "logps/rejected": -40.80259704589844, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 6.390904903411865, "rewards/margins": 0.6421432495117188, "rewards/rejected": 5.7487616539001465, "step": 8825 }, { "epoch": 1.95, "learning_rate": 1.4160171384064447e-08, "logits/chosen": -1.9548009634017944, "logits/rejected": -1.88344144821167, "logps/chosen": -72.44709777832031, "logps/rejected": -46.371307373046875, "loss": 0.0856, "rewards/accuracies": 1.0, "rewards/chosen": 6.835354804992676, "rewards/margins": 1.681884765625, "rewards/rejected": 5.153470039367676, "step": 8826 }, { "epoch": 1.95, "learning_rate": 1.402569663627673e-08, "logits/chosen": -1.9631544351577759, "logits/rejected": -1.9611549377441406, "logps/chosen": -48.926185607910156, "logps/rejected": -59.04606628417969, "loss": 0.1329, "rewards/accuracies": 1.0, "rewards/chosen": 3.958397626876831, "rewards/margins": 1.4187507629394531, "rewards/rejected": 2.539646863937378, "step": 8827 }, { "epoch": 1.95, "learning_rate": 1.3891862574201276e-08, "logits/chosen": -1.9543120861053467, "logits/rejected": -1.82672917842865, "logps/chosen": -84.32232666015625, "logps/rejected": -85.73393249511719, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": 8.143317222595215, "rewards/margins": 3.1963887214660645, "rewards/rejected": 4.94692850112915, "step": 8828 }, { "epoch": 1.95, "learning_rate": 1.3758669215034881e-08, "logits/chosen": -2.1302218437194824, "logits/rejected": -2.0803816318511963, "logps/chosen": -107.12803649902344, "logps/rejected": -63.04090118408203, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 6.2363786697387695, "rewards/margins": 3.508836507797241, "rewards/rejected": 2.7275421619415283, "step": 8829 }, { "epoch": 1.95, "learning_rate": 1.3626116575892746e-08, "logits/chosen": -1.8748546838760376, "logits/rejected": -1.8927090167999268, "logps/chosen": -73.54733276367188, "logps/rejected": -67.03340911865234, "loss": 2.4995, "rewards/accuracies": 0.0, "rewards/chosen": 2.45391845703125, "rewards/margins": -4.972480297088623, "rewards/rejected": 7.426398754119873, "step": 8830 }, { "epoch": 1.95, "learning_rate": 1.3494204673807909e-08, "logits/chosen": -2.097667932510376, "logits/rejected": -2.0879178047180176, "logps/chosen": -32.83306884765625, "logps/rejected": -27.17761993408203, "loss": 0.319, "rewards/accuracies": 1.0, "rewards/chosen": 1.7295738458633423, "rewards/margins": 0.11501693725585938, "rewards/rejected": 1.614556908607483, "step": 8831 }, { "epoch": 1.95, "learning_rate": 1.3362933525730704e-08, "logits/chosen": -2.4098565578460693, "logits/rejected": -2.4415736198425293, "logps/chosen": -111.98495483398438, "logps/rejected": -47.10931396484375, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 7.526086330413818, "rewards/margins": 6.615208148956299, "rewards/rejected": 0.9108780026435852, "step": 8832 }, { "epoch": 1.96, "learning_rate": 1.3232303148528747e-08, "logits/chosen": -2.079394817352295, "logits/rejected": -2.045408248901367, "logps/chosen": -21.478477478027344, "logps/rejected": -55.32847213745117, "loss": 0.6463, "rewards/accuracies": 0.0, "rewards/chosen": 2.5310349464416504, "rewards/margins": -0.822523832321167, "rewards/rejected": 3.3535587787628174, "step": 8833 }, { "epoch": 1.96, "learning_rate": 1.3102313558988056e-08, "logits/chosen": -2.1772992610931396, "logits/rejected": -2.169337272644043, "logps/chosen": -52.76349639892578, "logps/rejected": -35.072052001953125, "loss": 0.1829, "rewards/accuracies": 1.0, "rewards/chosen": 4.416912078857422, "rewards/margins": 0.8786742687225342, "rewards/rejected": 3.5382378101348877, "step": 8834 }, { "epoch": 1.96, "learning_rate": 1.2972964773811935e-08, "logits/chosen": -1.9442634582519531, "logits/rejected": -1.8954901695251465, "logps/chosen": -53.20753479003906, "logps/rejected": -41.11149597167969, "loss": 1.8894, "rewards/accuracies": 1.0, "rewards/chosen": 5.282522678375244, "rewards/margins": 0.2920212745666504, "rewards/rejected": 4.990501403808594, "step": 8835 }, { "epoch": 1.96, "learning_rate": 1.2844256809621536e-08, "logits/chosen": -1.830127239227295, "logits/rejected": -1.9187488555908203, "logps/chosen": -63.92900085449219, "logps/rejected": -213.13229370117188, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 9.506654739379883, "rewards/margins": 3.9600634574890137, "rewards/rejected": 5.546591281890869, "step": 8836 }, { "epoch": 1.96, "learning_rate": 1.2716189682955294e-08, "logits/chosen": -2.2860476970672607, "logits/rejected": -2.2478909492492676, "logps/chosen": -88.48030090332031, "logps/rejected": -35.71418762207031, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 5.443780422210693, "rewards/margins": 5.216592311859131, "rewards/rejected": 0.2271881103515625, "step": 8837 }, { "epoch": 1.96, "learning_rate": 1.2588763410268935e-08, "logits/chosen": -1.7985984086990356, "logits/rejected": -1.8282743692398071, "logps/chosen": -43.00700378417969, "logps/rejected": -72.72412109375, "loss": 1.48, "rewards/accuracies": 0.0, "rewards/chosen": 2.8849503993988037, "rewards/margins": -2.808415174484253, "rewards/rejected": 5.693365573883057, "step": 8838 }, { "epoch": 1.96, "learning_rate": 1.2461978007937692e-08, "logits/chosen": -1.937497615814209, "logits/rejected": -1.9159287214279175, "logps/chosen": -44.450286865234375, "logps/rejected": -44.44337463378906, "loss": 0.6125, "rewards/accuracies": 1.0, "rewards/chosen": 3.5685646533966064, "rewards/margins": 0.4581413269042969, "rewards/rejected": 3.1104233264923096, "step": 8839 }, { "epoch": 1.96, "learning_rate": 1.2335833492252425e-08, "logits/chosen": -1.954163908958435, "logits/rejected": -1.8853437900543213, "logps/chosen": -38.55448913574219, "logps/rejected": -99.69354248046875, "loss": 0.1625, "rewards/accuracies": 1.0, "rewards/chosen": 4.505344390869141, "rewards/margins": 1.188988447189331, "rewards/rejected": 3.3163559436798096, "step": 8840 }, { "epoch": 1.96, "learning_rate": 1.2210329879422943e-08, "logits/chosen": -2.1247105598449707, "logits/rejected": -2.1231887340545654, "logps/chosen": -104.04568481445312, "logps/rejected": -87.99305725097656, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": 10.123741149902344, "rewards/margins": 1.8096590042114258, "rewards/rejected": 8.314082145690918, "step": 8841 }, { "epoch": 1.96, "learning_rate": 1.208546718557524e-08, "logits/chosen": -1.9802258014678955, "logits/rejected": -2.0062966346740723, "logps/chosen": -66.10032653808594, "logps/rejected": -84.76807403564453, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": 5.805002689361572, "rewards/margins": 0.5963654518127441, "rewards/rejected": 5.208637237548828, "step": 8842 }, { "epoch": 1.96, "learning_rate": 1.1961245426754809e-08, "logits/chosen": -2.0878307819366455, "logits/rejected": -2.1365878582000732, "logps/chosen": -40.11100769042969, "logps/rejected": -109.65724182128906, "loss": 0.5015, "rewards/accuracies": 0.0, "rewards/chosen": 6.263159275054932, "rewards/margins": -0.32778167724609375, "rewards/rejected": 6.590940952301025, "step": 8843 }, { "epoch": 1.96, "learning_rate": 1.1837664618922773e-08, "logits/chosen": -2.210498332977295, "logits/rejected": -1.5853594541549683, "logps/chosen": -45.85175323486328, "logps/rejected": -40.227081298828125, "loss": 2.0172, "rewards/accuracies": 0.0, "rewards/chosen": 2.9849770069122314, "rewards/margins": -4.016017913818359, "rewards/rejected": 7.00099515914917, "step": 8844 }, { "epoch": 1.96, "learning_rate": 1.1714724777960318e-08, "logits/chosen": -2.0842812061309814, "logits/rejected": -2.0842812061309814, "logps/chosen": -35.44706726074219, "logps/rejected": -35.44706726074219, "loss": 0.4577, "rewards/accuracies": 0.0, "rewards/chosen": 3.597599744796753, "rewards/margins": 0.0, "rewards/rejected": 3.597599744796753, "step": 8845 }, { "epoch": 1.96, "learning_rate": 1.1592425919663697e-08, "logits/chosen": -2.2846322059631348, "logits/rejected": -2.33793044090271, "logps/chosen": -112.75619506835938, "logps/rejected": -156.241455078125, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 13.14410400390625, "rewards/margins": 2.9815425872802734, "rewards/rejected": 10.162561416625977, "step": 8846 }, { "epoch": 1.96, "learning_rate": 1.1470768059748671e-08, "logits/chosen": -1.9047988653182983, "logits/rejected": -1.832895278930664, "logps/chosen": -148.1855010986328, "logps/rejected": -57.120758056640625, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 5.952029705047607, "rewards/margins": 3.3540987968444824, "rewards/rejected": 2.597930908203125, "step": 8847 }, { "epoch": 1.96, "learning_rate": 1.1349751213848293e-08, "logits/chosen": -2.289708137512207, "logits/rejected": -2.2864110469818115, "logps/chosen": -47.267555236816406, "logps/rejected": -69.15744018554688, "loss": 0.3109, "rewards/accuracies": 1.0, "rewards/chosen": 4.479920387268066, "rewards/margins": 1.4549195766448975, "rewards/rejected": 3.025000810623169, "step": 8848 }, { "epoch": 1.96, "learning_rate": 1.1229375397511788e-08, "logits/chosen": -2.029160976409912, "logits/rejected": -1.9808099269866943, "logps/chosen": -73.97745513916016, "logps/rejected": -46.92534637451172, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 4.582206726074219, "rewards/margins": 2.401430368423462, "rewards/rejected": 2.180776357650757, "step": 8849 }, { "epoch": 1.96, "learning_rate": 1.1109640626208451e-08, "logits/chosen": -1.4810011386871338, "logits/rejected": -1.4748070240020752, "logps/chosen": -39.17470932006836, "logps/rejected": -46.219730377197266, "loss": 0.5141, "rewards/accuracies": 1.0, "rewards/chosen": 4.051976680755615, "rewards/margins": 0.9209635257720947, "rewards/rejected": 3.1310131549835205, "step": 8850 }, { "epoch": 1.96, "learning_rate": 1.0990546915323197e-08, "logits/chosen": -1.9815222024917603, "logits/rejected": -1.8043841123580933, "logps/chosen": -126.6129379272461, "logps/rejected": -77.60423278808594, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 6.188003063201904, "rewards/margins": 4.802439212799072, "rewards/rejected": 1.3855637311935425, "step": 8851 }, { "epoch": 1.96, "learning_rate": 1.087209428015934e-08, "logits/chosen": -1.8816331624984741, "logits/rejected": -1.946256160736084, "logps/chosen": -52.81703186035156, "logps/rejected": -92.89187622070312, "loss": 0.8015, "rewards/accuracies": 0.0, "rewards/chosen": 5.786120891571045, "rewards/margins": -1.372161865234375, "rewards/rejected": 7.15828275680542, "step": 8852 }, { "epoch": 1.96, "learning_rate": 1.0754282735937483e-08, "logits/chosen": -1.772727608680725, "logits/rejected": -1.772727608680725, "logps/chosen": -17.271337509155273, "logps/rejected": -17.271337509155273, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.9033044576644897, "rewards/margins": 0.0, "rewards/rejected": 1.9033044576644897, "step": 8853 }, { "epoch": 1.96, "learning_rate": 1.063711229779718e-08, "logits/chosen": -1.6411106586456299, "logits/rejected": -1.6118699312210083, "logps/chosen": -74.5260238647461, "logps/rejected": -53.43158721923828, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": 6.2327799797058105, "rewards/margins": 3.0122194290161133, "rewards/rejected": 3.2205605506896973, "step": 8854 }, { "epoch": 1.96, "learning_rate": 1.0520582980793614e-08, "logits/chosen": -1.957861065864563, "logits/rejected": -1.8877811431884766, "logps/chosen": -63.75102996826172, "logps/rejected": -156.2167205810547, "loss": 0.2092, "rewards/accuracies": 1.0, "rewards/chosen": 7.313648223876953, "rewards/margins": 0.659456729888916, "rewards/rejected": 6.654191493988037, "step": 8855 }, { "epoch": 1.96, "learning_rate": 1.0404694799900361e-08, "logits/chosen": -1.6736092567443848, "logits/rejected": -1.6888986825942993, "logps/chosen": -46.39872741699219, "logps/rejected": -62.13710403442383, "loss": 0.7322, "rewards/accuracies": 0.0, "rewards/chosen": 4.44180154800415, "rewards/margins": -1.1911439895629883, "rewards/rejected": 5.632945537567139, "step": 8856 }, { "epoch": 1.96, "learning_rate": 1.0289447770009398e-08, "logits/chosen": -1.5004955530166626, "logits/rejected": -1.557552695274353, "logps/chosen": -16.887910842895508, "logps/rejected": -66.97853088378906, "loss": 0.9523, "rewards/accuracies": 0.0, "rewards/chosen": 2.9258077144622803, "rewards/margins": -1.5095784664154053, "rewards/rejected": 4.4353861808776855, "step": 8857 }, { "epoch": 1.96, "learning_rate": 1.0174841905929433e-08, "logits/chosen": -2.197066068649292, "logits/rejected": -2.177551746368408, "logps/chosen": -73.21061706542969, "logps/rejected": -47.95944595336914, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 5.254552364349365, "rewards/margins": 2.888855218887329, "rewards/rejected": 2.365697145462036, "step": 8858 }, { "epoch": 1.96, "learning_rate": 1.0060877222387578e-08, "logits/chosen": -1.9715945720672607, "logits/rejected": -1.9403022527694702, "logps/chosen": -98.51837158203125, "logps/rejected": -81.07307434082031, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 7.177606105804443, "rewards/margins": 3.9153242111206055, "rewards/rejected": 3.262281894683838, "step": 8859 }, { "epoch": 1.96, "learning_rate": 9.947553734027116e-09, "logits/chosen": -2.1675100326538086, "logits/rejected": -2.115834951400757, "logps/chosen": -42.90498352050781, "logps/rejected": -15.96222972869873, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 5.2074875831604, "rewards/margins": 3.2221903800964355, "rewards/rejected": 1.9852970838546753, "step": 8860 }, { "epoch": 1.96, "learning_rate": 9.834871455409734e-09, "logits/chosen": -1.9481632709503174, "logits/rejected": -1.9778915643692017, "logps/chosen": -45.103736877441406, "logps/rejected": -142.45611572265625, "loss": 0.3018, "rewards/accuracies": 1.0, "rewards/chosen": 4.651891231536865, "rewards/margins": 0.5328025817871094, "rewards/rejected": 4.119088649749756, "step": 8861 }, { "epoch": 1.96, "learning_rate": 9.72283040101607e-09, "logits/chosen": -2.182248592376709, "logits/rejected": -2.1538801193237305, "logps/chosen": -97.08012390136719, "logps/rejected": -68.91244506835938, "loss": 0.1698, "rewards/accuracies": 1.0, "rewards/chosen": 5.706756591796875, "rewards/margins": 0.9067397117614746, "rewards/rejected": 4.8000168800354, "step": 8862 }, { "epoch": 1.96, "learning_rate": 9.611430585242387e-09, "logits/chosen": -1.9970180988311768, "logits/rejected": -2.00940203666687, "logps/chosen": -42.096343994140625, "logps/rejected": -68.03182983398438, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 6.648593425750732, "rewards/margins": 2.499588966369629, "rewards/rejected": 4.1490044593811035, "step": 8863 }, { "epoch": 1.96, "learning_rate": 9.50067202240279e-09, "logits/chosen": -1.6906237602233887, "logits/rejected": -1.6699528694152832, "logps/chosen": -34.02151107788086, "logps/rejected": -52.822425842285156, "loss": 0.222, "rewards/accuracies": 1.0, "rewards/chosen": 2.7283246517181396, "rewards/margins": 0.6189649105072021, "rewards/rejected": 2.1093597412109375, "step": 8864 }, { "epoch": 1.96, "learning_rate": 9.390554726730338e-09, "logits/chosen": -2.0417819023132324, "logits/rejected": -2.051482677459717, "logps/chosen": -55.462181091308594, "logps/rejected": -121.55218505859375, "loss": 0.7091, "rewards/accuracies": 0.0, "rewards/chosen": 5.465592384338379, "rewards/margins": -0.8458914756774902, "rewards/rejected": 6.311483860015869, "step": 8865 }, { "epoch": 1.96, "learning_rate": 9.281078712374825e-09, "logits/chosen": -2.0848166942596436, "logits/rejected": -2.1620094776153564, "logps/chosen": -23.81270980834961, "logps/rejected": -144.86578369140625, "loss": 1.5886, "rewards/accuracies": 0.0, "rewards/chosen": 3.0070042610168457, "rewards/margins": -3.1052026748657227, "rewards/rejected": 6.112206935882568, "step": 8866 }, { "epoch": 1.96, "learning_rate": 9.172243993402774e-09, "logits/chosen": -1.726374626159668, "logits/rejected": -1.6144161224365234, "logps/chosen": -53.155372619628906, "logps/rejected": -7.86954927444458, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": 3.8846237659454346, "rewards/margins": 2.283604145050049, "rewards/rejected": 1.6010197401046753, "step": 8867 }, { "epoch": 1.96, "learning_rate": 9.06405058380022e-09, "logits/chosen": -1.9626364707946777, "logits/rejected": -1.9416249990463257, "logps/chosen": -47.21638107299805, "logps/rejected": -61.275413513183594, "loss": 0.6888, "rewards/accuracies": 0.0, "rewards/chosen": 3.3724095821380615, "rewards/margins": -0.9030749797821045, "rewards/rejected": 4.275484561920166, "step": 8868 }, { "epoch": 1.96, "learning_rate": 8.956498497468823e-09, "logits/chosen": -1.930209994316101, "logits/rejected": -1.9561991691589355, "logps/chosen": -33.976165771484375, "logps/rejected": -81.79871368408203, "loss": 0.2061, "rewards/accuracies": 1.0, "rewards/chosen": 4.019649505615234, "rewards/margins": 0.8853378295898438, "rewards/rejected": 3.1343116760253906, "step": 8869 }, { "epoch": 1.96, "learning_rate": 8.849587748229193e-09, "logits/chosen": -2.149700164794922, "logits/rejected": -1.9825083017349243, "logps/chosen": -148.75054931640625, "logps/rejected": -70.96792602539062, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": 9.294656753540039, "rewards/margins": 5.282323837280273, "rewards/rejected": 4.012332916259766, "step": 8870 }, { "epoch": 1.96, "learning_rate": 8.743318349819784e-09, "logits/chosen": -1.6984097957611084, "logits/rejected": -1.649941086769104, "logps/chosen": -84.72538757324219, "logps/rejected": -39.27685546875, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": 4.871702671051025, "rewards/margins": 2.262723684310913, "rewards/rejected": 2.6089789867401123, "step": 8871 }, { "epoch": 1.96, "learning_rate": 8.637690315894676e-09, "logits/chosen": -1.8526490926742554, "logits/rejected": -1.5033313035964966, "logps/chosen": -32.6352653503418, "logps/rejected": -45.112152099609375, "loss": 0.2644, "rewards/accuracies": 1.0, "rewards/chosen": 2.1631886959075928, "rewards/margins": 0.37043964862823486, "rewards/rejected": 1.792749047279358, "step": 8872 }, { "epoch": 1.96, "learning_rate": 8.532703660028008e-09, "logits/chosen": -1.418533444404602, "logits/rejected": -1.4087249040603638, "logps/chosen": -69.24006652832031, "logps/rejected": -44.57468032836914, "loss": 0.3232, "rewards/accuracies": 1.0, "rewards/chosen": 4.314566135406494, "rewards/margins": 0.7820956707000732, "rewards/rejected": 3.532470464706421, "step": 8873 }, { "epoch": 1.96, "learning_rate": 8.428358395709546e-09, "logits/chosen": -1.8068394660949707, "logits/rejected": -1.7573974132537842, "logps/chosen": -93.84623718261719, "logps/rejected": -70.37628173828125, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 6.542721748352051, "rewards/margins": 3.099461555480957, "rewards/rejected": 3.4432601928710938, "step": 8874 }, { "epoch": 1.96, "learning_rate": 8.324654536347453e-09, "logits/chosen": -1.8861173391342163, "logits/rejected": -1.8678661584854126, "logps/chosen": -56.430538177490234, "logps/rejected": -94.96955871582031, "loss": 0.4214, "rewards/accuracies": 1.0, "rewards/chosen": 6.319172382354736, "rewards/margins": 0.42946290969848633, "rewards/rejected": 5.88970947265625, "step": 8875 }, { "epoch": 1.96, "learning_rate": 8.22159209526774e-09, "logits/chosen": -1.7564703226089478, "logits/rejected": -1.7199382781982422, "logps/chosen": -79.6839828491211, "logps/rejected": -84.86799621582031, "loss": 0.274, "rewards/accuracies": 1.0, "rewards/chosen": 2.829723358154297, "rewards/margins": 0.600919246673584, "rewards/rejected": 2.228804111480713, "step": 8876 }, { "epoch": 1.96, "learning_rate": 8.119171085713696e-09, "logits/chosen": -1.8310884237289429, "logits/rejected": -1.6667511463165283, "logps/chosen": -78.3638916015625, "logps/rejected": -20.730682373046875, "loss": 0.5232, "rewards/accuracies": 1.0, "rewards/chosen": 2.5843780040740967, "rewards/margins": 1.7815390825271606, "rewards/rejected": 0.802838921546936, "step": 8877 }, { "epoch": 1.97, "learning_rate": 8.017391520845352e-09, "logits/chosen": -2.043245553970337, "logits/rejected": -2.044663190841675, "logps/chosen": -71.85022735595703, "logps/rejected": -90.1221923828125, "loss": 0.1215, "rewards/accuracies": 1.0, "rewards/chosen": 8.522238731384277, "rewards/margins": 1.2932453155517578, "rewards/rejected": 7.2289934158325195, "step": 8878 }, { "epoch": 1.97, "learning_rate": 7.916253413742247e-09, "logits/chosen": -1.8913835287094116, "logits/rejected": -1.6958017349243164, "logps/chosen": -122.00871276855469, "logps/rejected": -17.390790939331055, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 7.653968811035156, "rewards/margins": 7.349886894226074, "rewards/rejected": 0.30408191680908203, "step": 8879 }, { "epoch": 1.97, "learning_rate": 7.815756777400096e-09, "logits/chosen": -1.8695048093795776, "logits/rejected": -1.7995675802230835, "logps/chosen": -55.03816604614258, "logps/rejected": -27.33721923828125, "loss": 0.4888, "rewards/accuracies": 0.0, "rewards/chosen": 4.3187055587768555, "rewards/margins": -0.3983492851257324, "rewards/rejected": 4.717054843902588, "step": 8880 }, { "epoch": 1.97, "learning_rate": 7.715901624732458e-09, "logits/chosen": -1.8800050020217896, "logits/rejected": -1.8800050020217896, "logps/chosen": -136.279296875, "logps/rejected": -136.279296875, "loss": 0.3501, "rewards/accuracies": 0.0, "rewards/chosen": 10.968459129333496, "rewards/margins": 0.0, "rewards/rejected": 10.968459129333496, "step": 8881 }, { "epoch": 1.97, "learning_rate": 7.61668796857018e-09, "logits/chosen": -1.8575665950775146, "logits/rejected": -1.8351178169250488, "logps/chosen": -51.85530090332031, "logps/rejected": -58.982872009277344, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": 3.3302841186523438, "rewards/margins": 0.7562522888183594, "rewards/rejected": 2.5740318298339844, "step": 8882 }, { "epoch": 1.97, "learning_rate": 7.518115821661953e-09, "logits/chosen": -1.977019190788269, "logits/rejected": -1.9584195613861084, "logps/chosen": -36.83367919921875, "logps/rejected": -50.79588317871094, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": 2.5724189281463623, "rewards/margins": 0.4458656311035156, "rewards/rejected": 2.1265532970428467, "step": 8883 }, { "epoch": 1.97, "learning_rate": 7.420185196674312e-09, "logits/chosen": -1.8741750717163086, "logits/rejected": -1.5764778852462769, "logps/chosen": -36.92784881591797, "logps/rejected": -85.34113311767578, "loss": 1.7722, "rewards/accuracies": 0.0, "rewards/chosen": 3.7079827785491943, "rewards/margins": -3.47391676902771, "rewards/rejected": 7.181899547576904, "step": 8884 }, { "epoch": 1.97, "learning_rate": 7.32289610619108e-09, "logits/chosen": -1.5956525802612305, "logits/rejected": -1.4398910999298096, "logps/chosen": -150.85745239257812, "logps/rejected": -64.49601745605469, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 6.079776287078857, "rewards/margins": 2.408128499984741, "rewards/rejected": 3.671647787094116, "step": 8885 }, { "epoch": 1.97, "learning_rate": 7.226248562713367e-09, "logits/chosen": -1.9865058660507202, "logits/rejected": -1.9270812273025513, "logps/chosen": -95.1409912109375, "logps/rejected": -64.37451934814453, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 6.640896797180176, "rewards/margins": 4.14678955078125, "rewards/rejected": 2.4941070079803467, "step": 8886 }, { "epoch": 1.97, "learning_rate": 7.1302425786606845e-09, "logits/chosen": -1.9905853271484375, "logits/rejected": -2.0252108573913574, "logps/chosen": -97.08541107177734, "logps/rejected": -148.33819580078125, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 8.32736873626709, "rewards/margins": 1.3174290657043457, "rewards/rejected": 7.009939670562744, "step": 8887 }, { "epoch": 1.97, "learning_rate": 7.034878166369275e-09, "logits/chosen": -1.9908093214035034, "logits/rejected": -1.9326896667480469, "logps/chosen": -103.21804809570312, "logps/rejected": -59.86762237548828, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 6.7476091384887695, "rewards/margins": 3.674697160720825, "rewards/rejected": 3.0729119777679443, "step": 8888 }, { "epoch": 1.97, "learning_rate": 6.940155338093224e-09, "logits/chosen": -1.969390869140625, "logits/rejected": -1.9279049634933472, "logps/chosen": -290.8673095703125, "logps/rejected": -59.5948486328125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 11.036211967468262, "rewards/margins": 5.965347766876221, "rewards/rejected": 5.070864200592041, "step": 8889 }, { "epoch": 1.97, "learning_rate": 6.846074106004464e-09, "logits/chosen": -2.270385503768921, "logits/rejected": -2.268144369125366, "logps/chosen": -36.85868835449219, "logps/rejected": -44.423728942871094, "loss": 0.1922, "rewards/accuracies": 1.0, "rewards/chosen": 4.149556636810303, "rewards/margins": 1.2039403915405273, "rewards/rejected": 2.9456162452697754, "step": 8890 }, { "epoch": 1.97, "learning_rate": 6.752634482191655e-09, "logits/chosen": -1.7684080600738525, "logits/rejected": -1.8364191055297852, "logps/chosen": -40.861820220947266, "logps/rejected": -85.97000122070312, "loss": 1.4679, "rewards/accuracies": 0.0, "rewards/chosen": 4.724377155303955, "rewards/margins": -2.7991580963134766, "rewards/rejected": 7.523535251617432, "step": 8891 }, { "epoch": 1.97, "learning_rate": 6.659836478662418e-09, "logits/chosen": -1.9124311208724976, "logits/rejected": -1.8237278461456299, "logps/chosen": -79.18498992919922, "logps/rejected": -67.62318420410156, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 5.1029181480407715, "rewards/margins": 4.427103519439697, "rewards/rejected": 0.6758148074150085, "step": 8892 }, { "epoch": 1.97, "learning_rate": 6.567680107339991e-09, "logits/chosen": -1.6620675325393677, "logits/rejected": -1.6083711385726929, "logps/chosen": -59.14271545410156, "logps/rejected": -20.819095611572266, "loss": 0.536, "rewards/accuracies": 1.0, "rewards/chosen": 3.5071258544921875, "rewards/margins": 1.0302042961120605, "rewards/rejected": 2.476921558380127, "step": 8893 }, { "epoch": 1.97, "learning_rate": 6.476165380067123e-09, "logits/chosen": -1.760201334953308, "logits/rejected": -1.7593393325805664, "logps/chosen": -37.63301086425781, "logps/rejected": -59.023223876953125, "loss": 0.3044, "rewards/accuracies": 1.0, "rewards/chosen": 3.271765947341919, "rewards/margins": 0.3368675708770752, "rewards/rejected": 2.9348983764648438, "step": 8894 }, { "epoch": 1.97, "learning_rate": 6.3852923086027415e-09, "logits/chosen": -1.9515528678894043, "logits/rejected": -1.9504954814910889, "logps/chosen": -30.29741668701172, "logps/rejected": -58.97848892211914, "loss": 0.1649, "rewards/accuracies": 1.0, "rewards/chosen": 4.084632873535156, "rewards/margins": 1.2623684406280518, "rewards/rejected": 2.8222644329071045, "step": 8895 }, { "epoch": 1.97, "learning_rate": 6.295060904623618e-09, "logits/chosen": -1.9663699865341187, "logits/rejected": -1.9663699865341187, "logps/chosen": -24.00179100036621, "logps/rejected": -24.00179100036621, "loss": 0.5057, "rewards/accuracies": 0.0, "rewards/chosen": 6.431743621826172, "rewards/margins": 0.0, "rewards/rejected": 6.431743621826172, "step": 8896 }, { "epoch": 1.97, "learning_rate": 6.20547117972492e-09, "logits/chosen": -2.1644136905670166, "logits/rejected": -2.1858701705932617, "logps/chosen": -55.677215576171875, "logps/rejected": -41.181148529052734, "loss": 0.8331, "rewards/accuracies": 1.0, "rewards/chosen": 2.6718697547912598, "rewards/margins": 0.3089413642883301, "rewards/rejected": 2.3629283905029297, "step": 8897 }, { "epoch": 1.97, "learning_rate": 6.1165231454185515e-09, "logits/chosen": -1.709146499633789, "logits/rejected": -1.7053018808364868, "logps/chosen": -32.998130798339844, "logps/rejected": -58.16883850097656, "loss": 0.2035, "rewards/accuracies": 1.0, "rewards/chosen": 2.982943296432495, "rewards/margins": 0.7216892242431641, "rewards/rejected": 2.261254072189331, "step": 8898 }, { "epoch": 1.97, "learning_rate": 6.028216813133702e-09, "logits/chosen": -2.0706543922424316, "logits/rejected": -2.1132986545562744, "logps/chosen": -148.98789978027344, "logps/rejected": -54.17692565917969, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 8.536005973815918, "rewards/margins": 3.680384635925293, "rewards/rejected": 4.855621337890625, "step": 8899 }, { "epoch": 1.97, "learning_rate": 5.940552194218518e-09, "logits/chosen": -1.8499133586883545, "logits/rejected": -1.7978588342666626, "logps/chosen": -52.04218292236328, "logps/rejected": -58.56127166748047, "loss": 0.1717, "rewards/accuracies": 1.0, "rewards/chosen": 4.113404273986816, "rewards/margins": 1.5200350284576416, "rewards/rejected": 2.593369245529175, "step": 8900 }, { "epoch": 1.97, "learning_rate": 5.8535292999362115e-09, "logits/chosen": -1.7492772340774536, "logits/rejected": -1.7492772340774536, "logps/chosen": -26.280742645263672, "logps/rejected": -26.280742645263672, "loss": 0.3572, "rewards/accuracies": 0.0, "rewards/chosen": 4.694119930267334, "rewards/margins": 0.0, "rewards/rejected": 4.694119930267334, "step": 8901 }, { "epoch": 1.97, "learning_rate": 5.767148141470058e-09, "logits/chosen": -2.175976037979126, "logits/rejected": -2.1589274406433105, "logps/chosen": -67.03900146484375, "logps/rejected": -74.70819854736328, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 6.83099365234375, "rewards/margins": 4.289815425872803, "rewards/rejected": 2.5411782264709473, "step": 8902 }, { "epoch": 1.97, "learning_rate": 5.681408729919513e-09, "logits/chosen": -1.7495579719543457, "logits/rejected": -1.7495579719543457, "logps/chosen": -70.845947265625, "logps/rejected": -70.845947265625, "loss": 0.4587, "rewards/accuracies": 0.0, "rewards/chosen": 3.999809980392456, "rewards/margins": 0.0, "rewards/rejected": 3.999809980392456, "step": 8903 }, { "epoch": 1.97, "learning_rate": 5.596311076302429e-09, "logits/chosen": -2.338209629058838, "logits/rejected": -2.3157966136932373, "logps/chosen": -50.108131408691406, "logps/rejected": -21.57322883605957, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 2.892873525619507, "rewards/margins": 2.713671922683716, "rewards/rejected": 0.17920151352882385, "step": 8904 }, { "epoch": 1.97, "learning_rate": 5.511855191552284e-09, "logits/chosen": -1.8910826444625854, "logits/rejected": -2.0535550117492676, "logps/chosen": -46.789005279541016, "logps/rejected": -22.863567352294922, "loss": 0.4641, "rewards/accuracies": 1.0, "rewards/chosen": 4.0508222579956055, "rewards/margins": 2.4009056091308594, "rewards/rejected": 1.6499165296554565, "step": 8905 }, { "epoch": 1.97, "learning_rate": 5.428041086523173e-09, "logits/chosen": -1.881206750869751, "logits/rejected": -1.881206750869751, "logps/chosen": -38.174285888671875, "logps/rejected": -38.174285888671875, "loss": 0.4225, "rewards/accuracies": 0.0, "rewards/chosen": 5.250699043273926, "rewards/margins": 0.0, "rewards/rejected": 5.250699043273926, "step": 8906 }, { "epoch": 1.97, "learning_rate": 5.3448687719837024e-09, "logits/chosen": -2.3632640838623047, "logits/rejected": -2.3632640838623047, "logps/chosen": -74.64451599121094, "logps/rejected": -74.64451599121094, "loss": 0.3558, "rewards/accuracies": 0.0, "rewards/chosen": 4.889222621917725, "rewards/margins": 0.0, "rewards/rejected": 4.889222621917725, "step": 8907 }, { "epoch": 1.97, "learning_rate": 5.26233825862199e-09, "logits/chosen": -2.1754515171051025, "logits/rejected": -2.181358575820923, "logps/chosen": -46.86720657348633, "logps/rejected": -49.038299560546875, "loss": 1.0817, "rewards/accuracies": 1.0, "rewards/chosen": 4.155233383178711, "rewards/margins": 1.3809406757354736, "rewards/rejected": 2.7742927074432373, "step": 8908 }, { "epoch": 1.97, "learning_rate": 5.180449557042333e-09, "logits/chosen": -1.6973356008529663, "logits/rejected": -1.6973356008529663, "logps/chosen": -37.87818908691406, "logps/rejected": -37.87818908691406, "loss": 0.35, "rewards/accuracies": 0.0, "rewards/chosen": 3.3903214931488037, "rewards/margins": 0.0, "rewards/rejected": 3.3903214931488037, "step": 8909 }, { "epoch": 1.97, "learning_rate": 5.099202677767978e-09, "logits/chosen": -1.8673357963562012, "logits/rejected": -1.831117033958435, "logps/chosen": -58.81705093383789, "logps/rejected": -86.77618408203125, "loss": 1.2359, "rewards/accuracies": 0.0, "rewards/chosen": 6.279784202575684, "rewards/margins": -2.242539405822754, "rewards/rejected": 8.522323608398438, "step": 8910 }, { "epoch": 1.97, "learning_rate": 5.0185976312389085e-09, "logits/chosen": -1.6653748750686646, "logits/rejected": -1.547890543937683, "logps/chosen": -42.7056884765625, "logps/rejected": -13.689257621765137, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 3.538412570953369, "rewards/margins": 2.7436299324035645, "rewards/rejected": 0.7947827577590942, "step": 8911 }, { "epoch": 1.97, "learning_rate": 4.938634427812394e-09, "logits/chosen": -1.5386006832122803, "logits/rejected": -1.4113872051239014, "logps/chosen": -37.85050582885742, "logps/rejected": -27.28423309326172, "loss": 0.5139, "rewards/accuracies": 0.0, "rewards/chosen": 1.7999600172042847, "rewards/margins": -0.5429474115371704, "rewards/rejected": 2.342907428741455, "step": 8912 }, { "epoch": 1.97, "learning_rate": 4.859313077762995e-09, "logits/chosen": -2.136413097381592, "logits/rejected": -2.136413097381592, "logps/chosen": -56.519622802734375, "logps/rejected": -56.519622802734375, "loss": 0.5555, "rewards/accuracies": 0.0, "rewards/chosen": 3.23785400390625, "rewards/margins": 0.0, "rewards/rejected": 3.23785400390625, "step": 8913 }, { "epoch": 1.97, "learning_rate": 4.780633591284778e-09, "logits/chosen": -2.1516480445861816, "logits/rejected": -2.1458847522735596, "logps/chosen": -60.597686767578125, "logps/rejected": -23.993379592895508, "loss": 0.3642, "rewards/accuracies": 1.0, "rewards/chosen": 1.5744285583496094, "rewards/margins": 0.366752028465271, "rewards/rejected": 1.2076765298843384, "step": 8914 }, { "epoch": 1.97, "learning_rate": 4.702595978486324e-09, "logits/chosen": -1.8804084062576294, "logits/rejected": -1.8804084062576294, "logps/chosen": -31.171510696411133, "logps/rejected": -31.171510696411133, "loss": 0.3594, "rewards/accuracies": 0.0, "rewards/chosen": 4.824670314788818, "rewards/margins": 0.0, "rewards/rejected": 4.824670314788818, "step": 8915 }, { "epoch": 1.97, "learning_rate": 4.6252002493962775e-09, "logits/chosen": -1.899031162261963, "logits/rejected": -1.945847749710083, "logps/chosen": -50.882598876953125, "logps/rejected": -56.77046203613281, "loss": 1.7872, "rewards/accuracies": 0.0, "rewards/chosen": 4.174285888671875, "rewards/margins": -3.378190517425537, "rewards/rejected": 7.552476406097412, "step": 8916 }, { "epoch": 1.97, "learning_rate": 4.5484464139589066e-09, "logits/chosen": -1.7782334089279175, "logits/rejected": -1.6852115392684937, "logps/chosen": -34.9454345703125, "logps/rejected": -50.74026107788086, "loss": 0.3659, "rewards/accuracies": 1.0, "rewards/chosen": 3.124544620513916, "rewards/margins": 0.09537696838378906, "rewards/rejected": 3.029167652130127, "step": 8917 }, { "epoch": 1.97, "learning_rate": 4.4723344820379874e-09, "logits/chosen": -2.0580999851226807, "logits/rejected": -1.8753397464752197, "logps/chosen": -117.49946594238281, "logps/rejected": -22.527050018310547, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 7.076942443847656, "rewards/margins": 6.986366271972656, "rewards/rejected": 0.09057598561048508, "step": 8918 }, { "epoch": 1.97, "learning_rate": 4.39686446341292e-09, "logits/chosen": -1.776282787322998, "logits/rejected": -1.7404603958129883, "logps/chosen": -23.8034610748291, "logps/rejected": -7.539295196533203, "loss": 0.0796, "rewards/accuracies": 1.0, "rewards/chosen": 2.8306643962860107, "rewards/margins": 2.4913692474365234, "rewards/rejected": 0.3392952084541321, "step": 8919 }, { "epoch": 1.97, "learning_rate": 4.322036367781501e-09, "logits/chosen": -1.7332054376602173, "logits/rejected": -1.7332054376602173, "logps/chosen": -33.09569549560547, "logps/rejected": -33.09569549560547, "loss": 0.3633, "rewards/accuracies": 0.0, "rewards/chosen": 2.032400608062744, "rewards/margins": 0.0, "rewards/rejected": 2.032400608062744, "step": 8920 }, { "epoch": 1.97, "learning_rate": 4.2478502047593735e-09, "logits/chosen": -1.7452369928359985, "logits/rejected": -1.7225425243377686, "logps/chosen": -22.968109130859375, "logps/rejected": -38.01616668701172, "loss": 0.3691, "rewards/accuracies": 1.0, "rewards/chosen": 2.7502007484436035, "rewards/margins": 0.569694995880127, "rewards/rejected": 2.1805057525634766, "step": 8921 }, { "epoch": 1.97, "learning_rate": 4.174305983878912e-09, "logits/chosen": -2.2189958095550537, "logits/rejected": -2.239630937576294, "logps/chosen": -73.02759552001953, "logps/rejected": -56.38872528076172, "loss": 0.1272, "rewards/accuracies": 1.0, "rewards/chosen": 7.934969425201416, "rewards/margins": 2.8011980056762695, "rewards/rejected": 5.1337714195251465, "step": 8922 }, { "epoch": 1.97, "learning_rate": 4.1014037145908906e-09, "logits/chosen": -1.7100294828414917, "logits/rejected": -1.709954857826233, "logps/chosen": -35.16466522216797, "logps/rejected": -64.18869018554688, "loss": 1.1374, "rewards/accuracies": 0.0, "rewards/chosen": 2.4269402027130127, "rewards/margins": -1.9804236888885498, "rewards/rejected": 4.4073638916015625, "step": 8923 }, { "epoch": 1.98, "learning_rate": 4.02914340626226e-09, "logits/chosen": -2.0294876098632812, "logits/rejected": -1.9322277307510376, "logps/chosen": -70.30430603027344, "logps/rejected": -140.69342041015625, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 9.624211311340332, "rewards/margins": 1.6718826293945312, "rewards/rejected": 7.952328681945801, "step": 8924 }, { "epoch": 1.98, "learning_rate": 3.957525068178925e-09, "logits/chosen": -1.7385733127593994, "logits/rejected": -1.6222237348556519, "logps/chosen": -27.884689331054688, "logps/rejected": -104.86717224121094, "loss": 0.5857, "rewards/accuracies": 1.0, "rewards/chosen": 3.6839089393615723, "rewards/margins": 0.4887261390686035, "rewards/rejected": 3.1951828002929688, "step": 8925 }, { "epoch": 1.98, "learning_rate": 3.886548709543525e-09, "logits/chosen": -1.9092931747436523, "logits/rejected": -1.8505889177322388, "logps/chosen": -83.0739974975586, "logps/rejected": -34.1829833984375, "loss": 0.0869, "rewards/accuracies": 1.0, "rewards/chosen": 4.8825602531433105, "rewards/margins": 2.513977289199829, "rewards/rejected": 2.3685829639434814, "step": 8926 }, { "epoch": 1.98, "learning_rate": 3.816214339475988e-09, "logits/chosen": -1.788684368133545, "logits/rejected": -1.771672248840332, "logps/chosen": -33.654541015625, "logps/rejected": -56.73493576049805, "loss": 0.6704, "rewards/accuracies": 0.0, "rewards/chosen": 3.297045946121216, "rewards/margins": -1.0372607707977295, "rewards/rejected": 4.334306716918945, "step": 8927 }, { "epoch": 1.98, "learning_rate": 3.746521967014638e-09, "logits/chosen": -1.9856473207473755, "logits/rejected": -1.95948326587677, "logps/chosen": -28.69391632080078, "logps/rejected": -25.12643814086914, "loss": 0.1892, "rewards/accuracies": 1.0, "rewards/chosen": 2.8015668392181396, "rewards/margins": 1.0266931056976318, "rewards/rejected": 1.7748737335205078, "step": 8928 }, { "epoch": 1.98, "learning_rate": 3.677471601114535e-09, "logits/chosen": -2.2150485515594482, "logits/rejected": -2.0993552207946777, "logps/chosen": -85.66324615478516, "logps/rejected": -17.413434982299805, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": 5.796233654022217, "rewards/margins": 4.719499588012695, "rewards/rejected": 1.076733946800232, "step": 8929 }, { "epoch": 1.98, "learning_rate": 3.6090632506485813e-09, "logits/chosen": -1.9736855030059814, "logits/rejected": -1.992517113685608, "logps/chosen": -27.080448150634766, "logps/rejected": -31.111576080322266, "loss": 0.7894, "rewards/accuracies": 0.0, "rewards/chosen": 3.8775737285614014, "rewards/margins": -0.20942473411560059, "rewards/rejected": 4.086998462677002, "step": 8930 }, { "epoch": 1.98, "learning_rate": 3.5412969244069674e-09, "logits/chosen": -1.9979276657104492, "logits/rejected": -1.9923831224441528, "logps/chosen": -48.332420349121094, "logps/rejected": -71.80006408691406, "loss": 0.7606, "rewards/accuracies": 0.0, "rewards/chosen": 3.385784149169922, "rewards/margins": -0.357058048248291, "rewards/rejected": 3.742842197418213, "step": 8931 }, { "epoch": 1.98, "learning_rate": 3.4741726310971725e-09, "logits/chosen": -1.9841340780258179, "logits/rejected": -1.9841340780258179, "logps/chosen": -75.33110046386719, "logps/rejected": -75.33110046386719, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 4.541119575500488, "rewards/margins": 0.0, "rewards/rejected": 4.541119575500488, "step": 8932 }, { "epoch": 1.98, "learning_rate": 3.4076903793450743e-09, "logits/chosen": -1.5730010271072388, "logits/rejected": -1.5430339574813843, "logps/chosen": -49.29096221923828, "logps/rejected": -75.52349853515625, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": 4.252835750579834, "rewards/margins": 1.5379126071929932, "rewards/rejected": 2.714923143386841, "step": 8933 }, { "epoch": 1.98, "learning_rate": 3.3418501776938394e-09, "logits/chosen": -1.9782726764678955, "logits/rejected": -1.906304121017456, "logps/chosen": -105.32806396484375, "logps/rejected": -69.50950622558594, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 7.461389064788818, "rewards/margins": 5.0949907302856445, "rewards/rejected": 2.366398572921753, "step": 8934 }, { "epoch": 1.98, "learning_rate": 3.2766520346028118e-09, "logits/chosen": -1.8253923654556274, "logits/rejected": -1.8296797275543213, "logps/chosen": -42.087364196777344, "logps/rejected": -36.750431060791016, "loss": 0.3806, "rewards/accuracies": 1.0, "rewards/chosen": 3.9106833934783936, "rewards/margins": 0.2504274845123291, "rewards/rejected": 3.6602559089660645, "step": 8935 }, { "epoch": 1.98, "learning_rate": 3.212095958449735e-09, "logits/chosen": -2.2252769470214844, "logits/rejected": -2.2388556003570557, "logps/chosen": -114.64118957519531, "logps/rejected": -124.92990112304688, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 11.55701732635498, "rewards/margins": 4.976417064666748, "rewards/rejected": 6.580600261688232, "step": 8936 }, { "epoch": 1.98, "learning_rate": 3.1481819575313045e-09, "logits/chosen": -1.8783605098724365, "logits/rejected": -1.8746947050094604, "logps/chosen": -22.37700080871582, "logps/rejected": -70.94891357421875, "loss": 0.1852, "rewards/accuracies": 1.0, "rewards/chosen": 2.8212077617645264, "rewards/margins": 0.8760936260223389, "rewards/rejected": 1.9451141357421875, "step": 8937 }, { "epoch": 1.98, "learning_rate": 3.0849100400587307e-09, "logits/chosen": -1.974859595298767, "logits/rejected": -1.9549106359481812, "logps/chosen": -133.69859313964844, "logps/rejected": -58.83009338378906, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 7.13270902633667, "rewards/margins": 3.361345052719116, "rewards/rejected": 3.7713639736175537, "step": 8938 }, { "epoch": 1.98, "learning_rate": 3.022280214163287e-09, "logits/chosen": -1.9516708850860596, "logits/rejected": -1.9654171466827393, "logps/chosen": -34.021697998046875, "logps/rejected": -56.52765655517578, "loss": 1.7634, "rewards/accuracies": 0.0, "rewards/chosen": 3.3771302700042725, "rewards/margins": -3.187615156173706, "rewards/rejected": 6.5647454261779785, "step": 8939 }, { "epoch": 1.98, "learning_rate": 2.9602924878918695e-09, "logits/chosen": -1.8884576559066772, "logits/rejected": -1.8387526273727417, "logps/chosen": -99.25474548339844, "logps/rejected": -95.54276275634766, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": 8.924960136413574, "rewards/margins": 5.4527482986450195, "rewards/rejected": 3.4722115993499756, "step": 8940 }, { "epoch": 1.98, "learning_rate": 2.8989468692108835e-09, "logits/chosen": -1.9093568325042725, "logits/rejected": -1.7660164833068848, "logps/chosen": -112.13134765625, "logps/rejected": -48.714210510253906, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 5.664642333984375, "rewards/margins": 3.2335517406463623, "rewards/rejected": 2.4310905933380127, "step": 8941 }, { "epoch": 1.98, "learning_rate": 2.838243366002358e-09, "logits/chosen": -1.901175856590271, "logits/rejected": -1.8987412452697754, "logps/chosen": -55.98460388183594, "logps/rejected": -75.45916748046875, "loss": 0.1794, "rewards/accuracies": 1.0, "rewards/chosen": 4.277261257171631, "rewards/margins": 1.6654295921325684, "rewards/rejected": 2.6118316650390625, "step": 8942 }, { "epoch": 1.98, "learning_rate": 2.7781819860667194e-09, "logits/chosen": -1.9928103685379028, "logits/rejected": -1.9677481651306152, "logps/chosen": -47.54749298095703, "logps/rejected": -77.48799133300781, "loss": 0.542, "rewards/accuracies": 0.0, "rewards/chosen": 5.1665215492248535, "rewards/margins": -0.6477713584899902, "rewards/rejected": 5.814292907714844, "step": 8943 }, { "epoch": 1.98, "learning_rate": 2.718762737121683e-09, "logits/chosen": -1.8648066520690918, "logits/rejected": -1.8955234289169312, "logps/chosen": -55.690521240234375, "logps/rejected": -93.6124267578125, "loss": 1.4989, "rewards/accuracies": 1.0, "rewards/chosen": 4.0498809814453125, "rewards/margins": 0.892941951751709, "rewards/rejected": 3.1569390296936035, "step": 8944 }, { "epoch": 1.98, "learning_rate": 2.6599856268028078e-09, "logits/chosen": -1.680500864982605, "logits/rejected": -1.5352705717086792, "logps/chosen": -28.70378875732422, "logps/rejected": -14.368972778320312, "loss": 0.1939, "rewards/accuracies": 1.0, "rewards/chosen": 1.8324486017227173, "rewards/margins": 0.7813701629638672, "rewards/rejected": 1.05107843875885, "step": 8945 }, { "epoch": 1.98, "learning_rate": 2.6018506626623864e-09, "logits/chosen": -1.7313395738601685, "logits/rejected": -1.780582308769226, "logps/chosen": -25.423080444335938, "logps/rejected": -49.478424072265625, "loss": 1.5453, "rewards/accuracies": 0.0, "rewards/chosen": 2.3724353313446045, "rewards/margins": -2.950394868850708, "rewards/rejected": 5.3228302001953125, "step": 8946 }, { "epoch": 1.98, "learning_rate": 2.5443578521705536e-09, "logits/chosen": -2.0132157802581787, "logits/rejected": -2.0076394081115723, "logps/chosen": -60.16394805908203, "logps/rejected": -64.64186096191406, "loss": 0.139, "rewards/accuracies": 1.0, "rewards/chosen": 3.5181076526641846, "rewards/margins": 1.6251686811447144, "rewards/rejected": 1.8929389715194702, "step": 8947 }, { "epoch": 1.98, "learning_rate": 2.487507202715289e-09, "logits/chosen": -1.7767459154129028, "logits/rejected": -1.7174533605575562, "logps/chosen": -31.95010757446289, "logps/rejected": -33.21928405761719, "loss": 0.323, "rewards/accuracies": 1.0, "rewards/chosen": 3.882582426071167, "rewards/margins": 0.6335649490356445, "rewards/rejected": 3.2490174770355225, "step": 8948 }, { "epoch": 1.98, "learning_rate": 2.431298721601305e-09, "logits/chosen": -1.9954543113708496, "logits/rejected": -1.989877462387085, "logps/chosen": -61.930389404296875, "logps/rejected": -28.76951789855957, "loss": 1.1794, "rewards/accuracies": 0.0, "rewards/chosen": 3.192517042160034, "rewards/margins": -0.09668278694152832, "rewards/rejected": 3.2891998291015625, "step": 8949 }, { "epoch": 1.98, "learning_rate": 2.3757324160522678e-09, "logits/chosen": -1.8658080101013184, "logits/rejected": -1.8953392505645752, "logps/chosen": -39.990821838378906, "logps/rejected": -150.1352996826172, "loss": 0.2987, "rewards/accuracies": 1.0, "rewards/chosen": 3.6671502590179443, "rewards/margins": 1.3753609657287598, "rewards/rejected": 2.2917892932891846, "step": 8950 }, { "epoch": 1.98, "learning_rate": 2.320808293206911e-09, "logits/chosen": -2.036339521408081, "logits/rejected": -2.0429069995880127, "logps/chosen": -57.51396179199219, "logps/rejected": -60.519813537597656, "loss": 0.3955, "rewards/accuracies": 0.0, "rewards/chosen": 3.5757012367248535, "rewards/margins": -0.1485426425933838, "rewards/rejected": 3.7242438793182373, "step": 8951 }, { "epoch": 1.98, "learning_rate": 2.2665263601240328e-09, "logits/chosen": -2.063288927078247, "logits/rejected": -2.01859712600708, "logps/chosen": -35.54722595214844, "logps/rejected": -12.350592613220215, "loss": 0.709, "rewards/accuracies": 1.0, "rewards/chosen": 2.389432668685913, "rewards/margins": 1.1995419263839722, "rewards/rejected": 1.189890742301941, "step": 8952 }, { "epoch": 1.98, "learning_rate": 2.2128866237786095e-09, "logits/chosen": -2.055633783340454, "logits/rejected": -2.0247936248779297, "logps/chosen": -52.147300720214844, "logps/rejected": -40.882083892822266, "loss": 0.4905, "rewards/accuracies": 0.0, "rewards/chosen": 4.341379642486572, "rewards/margins": -0.34235095977783203, "rewards/rejected": 4.683730602264404, "step": 8953 }, { "epoch": 1.98, "learning_rate": 2.1598890910623505e-09, "logits/chosen": -1.8864340782165527, "logits/rejected": -1.8839309215545654, "logps/chosen": -47.464256286621094, "logps/rejected": -62.402435302734375, "loss": 0.1737, "rewards/accuracies": 1.0, "rewards/chosen": 4.410489559173584, "rewards/margins": 1.0620391368865967, "rewards/rejected": 3.3484504222869873, "step": 8954 }, { "epoch": 1.98, "learning_rate": 2.1075337687859186e-09, "logits/chosen": -1.731972575187683, "logits/rejected": -1.670515775680542, "logps/chosen": -73.67584228515625, "logps/rejected": -66.95515441894531, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 5.787452697753906, "rewards/margins": 2.6715285778045654, "rewards/rejected": 3.115924119949341, "step": 8955 }, { "epoch": 1.98, "learning_rate": 2.055820663677266e-09, "logits/chosen": -1.900773286819458, "logits/rejected": -1.4152332544326782, "logps/chosen": -106.51322937011719, "logps/rejected": -124.76399993896484, "loss": 0.1222, "rewards/accuracies": 1.0, "rewards/chosen": 5.809425354003906, "rewards/margins": 1.5598855018615723, "rewards/rejected": 4.249539852142334, "step": 8956 }, { "epoch": 1.98, "learning_rate": 2.0047497823805216e-09, "logits/chosen": -2.1990444660186768, "logits/rejected": -2.191330909729004, "logps/chosen": -139.02389526367188, "logps/rejected": -34.54609680175781, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 8.430679321289062, "rewards/margins": 5.407824516296387, "rewards/rejected": 3.0228545665740967, "step": 8957 }, { "epoch": 1.98, "learning_rate": 1.9543211314587696e-09, "logits/chosen": -1.8769952058792114, "logits/rejected": -1.8396422863006592, "logps/chosen": -83.14567565917969, "logps/rejected": -227.63009643554688, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 10.205824851989746, "rewards/margins": 1.743941307067871, "rewards/rejected": 8.461883544921875, "step": 8958 }, { "epoch": 1.98, "learning_rate": 1.904534717391271e-09, "logits/chosen": -1.6542062759399414, "logits/rejected": -1.6542062759399414, "logps/chosen": -10.566130638122559, "logps/rejected": -10.566130638122559, "loss": 0.3527, "rewards/accuracies": 0.0, "rewards/chosen": 2.5084550380706787, "rewards/margins": 0.0, "rewards/rejected": 2.5084550380706787, "step": 8959 }, { "epoch": 1.98, "learning_rate": 1.8553905465767963e-09, "logits/chosen": -1.5593711137771606, "logits/rejected": -1.5803325176239014, "logps/chosen": -12.68454360961914, "logps/rejected": -28.31597137451172, "loss": 0.4694, "rewards/accuracies": 0.0, "rewards/chosen": 1.7011326551437378, "rewards/margins": -0.2302924394607544, "rewards/rejected": 1.9314250946044922, "step": 8960 }, { "epoch": 1.98, "learning_rate": 1.8068886253286289e-09, "logits/chosen": -1.9848780632019043, "logits/rejected": -1.926591396331787, "logps/chosen": -61.551902770996094, "logps/rejected": -68.82650756835938, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 7.044971466064453, "rewards/margins": 4.23239278793335, "rewards/rejected": 2.8125786781311035, "step": 8961 }, { "epoch": 1.98, "learning_rate": 1.759028959880671e-09, "logits/chosen": -1.9778568744659424, "logits/rejected": -1.9917396306991577, "logps/chosen": -46.41778564453125, "logps/rejected": -78.2544937133789, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 4.520301818847656, "rewards/margins": 2.732748508453369, "rewards/rejected": 1.7875534296035767, "step": 8962 }, { "epoch": 1.98, "learning_rate": 1.7118115563813376e-09, "logits/chosen": -2.1978938579559326, "logits/rejected": -2.195354461669922, "logps/chosen": -32.78495788574219, "logps/rejected": -55.34897994995117, "loss": 0.5923, "rewards/accuracies": 1.0, "rewards/chosen": 3.851001024246216, "rewards/margins": 0.5634541511535645, "rewards/rejected": 3.2875468730926514, "step": 8963 }, { "epoch": 1.98, "learning_rate": 1.6652364208991078e-09, "logits/chosen": -1.831890344619751, "logits/rejected": -1.887612223625183, "logps/chosen": -173.69656372070312, "logps/rejected": -108.38127136230469, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 10.057089805603027, "rewards/margins": 2.5890612602233887, "rewards/rejected": 7.468028545379639, "step": 8964 }, { "epoch": 1.98, "learning_rate": 1.619303559418639e-09, "logits/chosen": -1.7053215503692627, "logits/rejected": -1.7053215503692627, "logps/chosen": -24.686962127685547, "logps/rejected": -24.686962127685547, "loss": 0.559, "rewards/accuracies": 0.0, "rewards/chosen": 5.741293907165527, "rewards/margins": 0.0, "rewards/rejected": 5.741293907165527, "step": 8965 }, { "epoch": 1.98, "learning_rate": 1.5740129778413215e-09, "logits/chosen": -2.1444993019104004, "logits/rejected": -2.1200530529022217, "logps/chosen": -87.7221908569336, "logps/rejected": -40.429649353027344, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 7.2611565589904785, "rewards/margins": 2.5088043212890625, "rewards/rejected": 4.752352237701416, "step": 8966 }, { "epoch": 1.98, "learning_rate": 1.5293646819874997e-09, "logits/chosen": -1.996622920036316, "logits/rejected": -1.976373314857483, "logps/chosen": -54.887054443359375, "logps/rejected": -72.13512420654297, "loss": 0.2941, "rewards/accuracies": 1.0, "rewards/chosen": 5.374776363372803, "rewards/margins": 2.160658121109009, "rewards/rejected": 3.214118242263794, "step": 8967 }, { "epoch": 1.98, "learning_rate": 1.485358677594806e-09, "logits/chosen": -1.6651442050933838, "logits/rejected": -1.6530179977416992, "logps/chosen": -55.16255187988281, "logps/rejected": -81.58944702148438, "loss": 0.4727, "rewards/accuracies": 0.0, "rewards/chosen": 5.613442897796631, "rewards/margins": -0.02312469482421875, "rewards/rejected": 5.63656759262085, "step": 8968 }, { "epoch": 1.99, "learning_rate": 1.441994970317051e-09, "logits/chosen": -2.0259690284729004, "logits/rejected": -2.0548176765441895, "logps/chosen": -57.98208999633789, "logps/rejected": -56.5363883972168, "loss": 0.3246, "rewards/accuracies": 1.0, "rewards/chosen": 3.780529499053955, "rewards/margins": 0.10565876960754395, "rewards/rejected": 3.674870729446411, "step": 8969 }, { "epoch": 1.99, "learning_rate": 1.3992735657269997e-09, "logits/chosen": -1.9819380044937134, "logits/rejected": -1.9433763027191162, "logps/chosen": -55.77017593383789, "logps/rejected": -80.1890869140625, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": 8.394854545593262, "rewards/margins": 3.8393397331237793, "rewards/rejected": 4.555514812469482, "step": 8970 }, { "epoch": 1.99, "learning_rate": 1.3571944693135942e-09, "logits/chosen": -1.782858967781067, "logits/rejected": -1.6887425184249878, "logps/chosen": -62.76841735839844, "logps/rejected": -36.65461730957031, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 5.290782928466797, "rewards/margins": 2.693622589111328, "rewards/rejected": 2.5971603393554688, "step": 8971 }, { "epoch": 1.99, "learning_rate": 1.315757686484176e-09, "logits/chosen": -2.0311810970306396, "logits/rejected": -2.0314853191375732, "logps/chosen": -47.026641845703125, "logps/rejected": -56.12038803100586, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 3.7872543334960938, "rewards/margins": 1.592519760131836, "rewards/rejected": 2.194734573364258, "step": 8972 }, { "epoch": 1.99, "learning_rate": 1.2749632225639296e-09, "logits/chosen": -1.8933228254318237, "logits/rejected": -1.8933228254318237, "logps/chosen": -36.028770446777344, "logps/rejected": -36.028770446777344, "loss": 0.3488, "rewards/accuracies": 0.0, "rewards/chosen": 1.6075042486190796, "rewards/margins": 0.0, "rewards/rejected": 1.6075042486190796, "step": 8973 }, { "epoch": 1.99, "learning_rate": 1.2348110827942184e-09, "logits/chosen": -1.5679014921188354, "logits/rejected": -1.462546706199646, "logps/chosen": -22.299938201904297, "logps/rejected": -11.425870895385742, "loss": 0.2019, "rewards/accuracies": 1.0, "rewards/chosen": 2.9453296661376953, "rewards/margins": 0.7167561054229736, "rewards/rejected": 2.2285735607147217, "step": 8974 }, { "epoch": 1.99, "learning_rate": 1.1953012723342483e-09, "logits/chosen": -1.8587384223937988, "logits/rejected": -1.831960916519165, "logps/chosen": -51.659549713134766, "logps/rejected": -71.61734771728516, "loss": 0.3214, "rewards/accuracies": 1.0, "rewards/chosen": 2.830634832382202, "rewards/margins": 0.11978340148925781, "rewards/rejected": 2.7108514308929443, "step": 8975 }, { "epoch": 1.99, "learning_rate": 1.1564337962616245e-09, "logits/chosen": -1.8415300846099854, "logits/rejected": -1.7970393896102905, "logps/chosen": -69.79707336425781, "logps/rejected": -132.287841796875, "loss": 0.3061, "rewards/accuracies": 1.0, "rewards/chosen": 8.924956321716309, "rewards/margins": 0.27801513671875, "rewards/rejected": 8.646941184997559, "step": 8976 }, { "epoch": 1.99, "learning_rate": 1.1182086595701302e-09, "logits/chosen": -1.9996930360794067, "logits/rejected": -2.049311876296997, "logps/chosen": -163.74057006835938, "logps/rejected": -151.6392059326172, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 10.378897666931152, "rewards/margins": 4.183594226837158, "rewards/rejected": 6.195303440093994, "step": 8977 }, { "epoch": 1.99, "learning_rate": 1.0806258671719473e-09, "logits/chosen": -1.8376859426498413, "logits/rejected": -1.8104878664016724, "logps/chosen": -55.864585876464844, "logps/rejected": -45.04290008544922, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 6.847942352294922, "rewards/margins": 2.886864423751831, "rewards/rejected": 3.961077928543091, "step": 8978 }, { "epoch": 1.99, "learning_rate": 1.0436854238959903e-09, "logits/chosen": -1.8752304315567017, "logits/rejected": -1.7717920541763306, "logps/chosen": -44.84930419921875, "logps/rejected": -11.631582260131836, "loss": 0.0998, "rewards/accuracies": 1.0, "rewards/chosen": 3.2719528675079346, "rewards/margins": 2.109546661376953, "rewards/rejected": 1.1624062061309814, "step": 8979 }, { "epoch": 1.99, "learning_rate": 1.0073873344895735e-09, "logits/chosen": -1.499892234802246, "logits/rejected": -1.5359957218170166, "logps/chosen": -86.16583251953125, "logps/rejected": -153.5543212890625, "loss": 0.7989, "rewards/accuracies": 0.0, "rewards/chosen": 3.936283826828003, "rewards/margins": -0.9464356899261475, "rewards/rejected": 4.88271951675415, "step": 8980 }, { "epoch": 1.99, "learning_rate": 9.717316036167434e-10, "logits/chosen": -2.0200388431549072, "logits/rejected": -1.9297153949737549, "logps/chosen": -126.8873291015625, "logps/rejected": -56.304405212402344, "loss": 0.0833, "rewards/accuracies": 1.0, "rewards/chosen": 6.129843235015869, "rewards/margins": 3.6318199634552, "rewards/rejected": 2.498023271560669, "step": 8981 }, { "epoch": 1.99, "learning_rate": 9.367182358588355e-10, "logits/chosen": -2.192023992538452, "logits/rejected": -1.715438723564148, "logps/chosen": -49.4676513671875, "logps/rejected": -73.7909164428711, "loss": 0.5602, "rewards/accuracies": 0.0, "rewards/chosen": 3.938786268234253, "rewards/margins": -0.6470706462860107, "rewards/rejected": 4.585856914520264, "step": 8982 }, { "epoch": 1.99, "learning_rate": 9.023472357155838e-10, "logits/chosen": -2.264455795288086, "logits/rejected": -2.232696771621704, "logps/chosen": -101.30569458007812, "logps/rejected": -54.618343353271484, "loss": 0.1365, "rewards/accuracies": 1.0, "rewards/chosen": 9.430511474609375, "rewards/margins": 1.5310940742492676, "rewards/rejected": 7.899417400360107, "step": 8983 }, { "epoch": 1.99, "learning_rate": 8.686186076029001e-10, "logits/chosen": -2.111473798751831, "logits/rejected": -2.1297426223754883, "logps/chosen": -35.047298431396484, "logps/rejected": -55.51557922363281, "loss": 0.4095, "rewards/accuracies": 1.0, "rewards/chosen": 5.070183277130127, "rewards/margins": 0.8833303451538086, "rewards/rejected": 4.186852931976318, "step": 8984 }, { "epoch": 1.99, "learning_rate": 8.355323558550954e-10, "logits/chosen": -1.7778111696243286, "logits/rejected": -1.836303472518921, "logps/chosen": -39.739234924316406, "logps/rejected": -91.23713684082031, "loss": 0.1888, "rewards/accuracies": 1.0, "rewards/chosen": 4.941404819488525, "rewards/margins": 0.9039888381958008, "rewards/rejected": 4.037415981292725, "step": 8985 }, { "epoch": 1.99, "learning_rate": 8.030884847237686e-10, "logits/chosen": -2.1000380516052246, "logits/rejected": -2.1213512420654297, "logps/chosen": -68.00521850585938, "logps/rejected": -160.9918212890625, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 9.571978569030762, "rewards/margins": 2.7546567916870117, "rewards/rejected": 6.81732177734375, "step": 8986 }, { "epoch": 1.99, "learning_rate": 7.712869983778071e-10, "logits/chosen": -2.0968427658081055, "logits/rejected": -2.1050775051116943, "logps/chosen": -44.053855895996094, "logps/rejected": -57.3209228515625, "loss": 1.5194, "rewards/accuracies": 1.0, "rewards/chosen": 2.684001922607422, "rewards/margins": 0.20260930061340332, "rewards/rejected": 2.4813926219940186, "step": 8987 }, { "epoch": 1.99, "learning_rate": 7.401279009039419e-10, "logits/chosen": -1.9672659635543823, "logits/rejected": -1.9050633907318115, "logps/chosen": -193.42942810058594, "logps/rejected": -123.91334533691406, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 11.909868240356445, "rewards/margins": 2.434910774230957, "rewards/rejected": 9.474957466125488, "step": 8988 }, { "epoch": 1.99, "learning_rate": 7.096111963056373e-10, "logits/chosen": -1.8776181936264038, "logits/rejected": -1.7706912755966187, "logps/chosen": -40.554466247558594, "logps/rejected": -11.016648292541504, "loss": 0.2752, "rewards/accuracies": 1.0, "rewards/chosen": 3.7804391384124756, "rewards/margins": 3.041914463043213, "rewards/rejected": 0.7385245561599731, "step": 8989 }, { "epoch": 1.99, "learning_rate": 6.797368885042011e-10, "logits/chosen": -1.8948678970336914, "logits/rejected": -1.8749713897705078, "logps/chosen": -47.96329116821289, "logps/rejected": -52.063880920410156, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 3.5429654121398926, "rewards/margins": 1.425159215927124, "rewards/rejected": 2.1178061962127686, "step": 8990 }, { "epoch": 1.99, "learning_rate": 6.505049813387843e-10, "logits/chosen": -1.6687450408935547, "logits/rejected": -1.4907270669937134, "logps/chosen": -43.542938232421875, "logps/rejected": -7.171756267547607, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 3.360945224761963, "rewards/margins": 2.3514161109924316, "rewards/rejected": 1.0095289945602417, "step": 8991 }, { "epoch": 1.99, "learning_rate": 6.219154785652714e-10, "logits/chosen": -2.2674896717071533, "logits/rejected": -2.191070079803467, "logps/chosen": -112.21137237548828, "logps/rejected": -65.1571044921875, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 7.312746524810791, "rewards/margins": 2.613837242126465, "rewards/rejected": 4.698909282684326, "step": 8992 }, { "epoch": 1.99, "learning_rate": 5.939683838568356e-10, "logits/chosen": -1.8653876781463623, "logits/rejected": -1.9300165176391602, "logps/chosen": -55.06265640258789, "logps/rejected": -76.3968734741211, "loss": 1.6426, "rewards/accuracies": 0.0, "rewards/chosen": 6.236013412475586, "rewards/margins": -3.2439165115356445, "rewards/rejected": 9.47992992401123, "step": 8993 }, { "epoch": 1.99, "learning_rate": 5.666637008061582e-10, "logits/chosen": -2.202644109725952, "logits/rejected": -2.121720314025879, "logps/chosen": -108.46714782714844, "logps/rejected": -56.31147766113281, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": 6.605473518371582, "rewards/margins": 4.018895149230957, "rewards/rejected": 2.586578369140625, "step": 8994 }, { "epoch": 1.99, "learning_rate": 5.400014329204339e-10, "logits/chosen": -1.5803130865097046, "logits/rejected": -1.5803130865097046, "logps/chosen": -18.352781295776367, "logps/rejected": -18.352781295776367, "loss": 0.4193, "rewards/accuracies": 0.0, "rewards/chosen": 2.224252462387085, "rewards/margins": 0.0, "rewards/rejected": 2.224252462387085, "step": 8995 }, { "epoch": 1.99, "learning_rate": 5.139815836263661e-10, "logits/chosen": -1.8569974899291992, "logits/rejected": -1.7930375337600708, "logps/chosen": -74.78485870361328, "logps/rejected": -169.95303344726562, "loss": 0.7076, "rewards/accuracies": 0.0, "rewards/chosen": 8.26437759399414, "rewards/margins": -0.9788389205932617, "rewards/rejected": 9.243216514587402, "step": 8996 }, { "epoch": 1.99, "learning_rate": 4.886041562668365e-10, "logits/chosen": -1.8273974657058716, "logits/rejected": -1.8294304609298706, "logps/chosen": -5.826377868652344, "logps/rejected": -20.473346710205078, "loss": 0.2521, "rewards/accuracies": 1.0, "rewards/chosen": 1.2614433765411377, "rewards/margins": 0.5141014456748962, "rewards/rejected": 0.7473419308662415, "step": 8997 }, { "epoch": 1.99, "learning_rate": 4.6386915410312485e-10, "logits/chosen": -1.9940683841705322, "logits/rejected": -1.9727458953857422, "logps/chosen": -55.52812194824219, "logps/rejected": -61.125099182128906, "loss": 0.7495, "rewards/accuracies": 0.0, "rewards/chosen": 4.159177303314209, "rewards/margins": -1.201979160308838, "rewards/rejected": 5.361156463623047, "step": 8998 }, { "epoch": 1.99, "learning_rate": 4.39776580314355e-10, "logits/chosen": -1.995108723640442, "logits/rejected": -1.9856228828430176, "logps/chosen": -63.86647033691406, "logps/rejected": -124.17842864990234, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 9.542207717895508, "rewards/margins": 1.3589000701904297, "rewards/rejected": 8.183307647705078, "step": 8999 }, { "epoch": 1.99, "learning_rate": 4.163264379952736e-10, "logits/chosen": -1.726115107536316, "logits/rejected": -1.6178234815597534, "logps/chosen": -28.93879508972168, "logps/rejected": -61.14751434326172, "loss": 1.4152, "rewards/accuracies": 0.0, "rewards/chosen": 4.89430570602417, "rewards/margins": -2.6509876251220703, "rewards/rejected": 7.54529333114624, "step": 9000 }, { "epoch": 1.99, "learning_rate": 3.935187301595811e-10, "logits/chosen": -1.8727377653121948, "logits/rejected": -1.8308383226394653, "logps/chosen": -71.69876098632812, "logps/rejected": -60.669044494628906, "loss": 0.3632, "rewards/accuracies": 0.0, "rewards/chosen": 4.859004974365234, "rewards/margins": -0.05975961685180664, "rewards/rejected": 4.918764591217041, "step": 9001 }, { "epoch": 1.99, "learning_rate": 3.713534597377111e-10, "logits/chosen": -2.2518625259399414, "logits/rejected": -2.2518625259399414, "logps/chosen": -36.2708740234375, "logps/rejected": -36.2708740234375, "loss": 0.3944, "rewards/accuracies": 0.0, "rewards/chosen": 7.836267948150635, "rewards/margins": 0.0, "rewards/rejected": 7.836267948150635, "step": 9002 }, { "epoch": 1.99, "learning_rate": 3.498306295784959e-10, "logits/chosen": -2.0700039863586426, "logits/rejected": -2.0734879970550537, "logps/chosen": -47.766563415527344, "logps/rejected": -83.723876953125, "loss": 0.3607, "rewards/accuracies": 1.0, "rewards/chosen": 3.310636281967163, "rewards/margins": 1.2553491592407227, "rewards/rejected": 2.0552871227264404, "step": 9003 }, { "epoch": 1.99, "learning_rate": 3.2895024244750105e-10, "logits/chosen": -2.1825971603393555, "logits/rejected": -2.1503822803497314, "logps/chosen": -74.07563781738281, "logps/rejected": -62.50950622558594, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 8.02635669708252, "rewards/margins": 5.181241035461426, "rewards/rejected": 2.8451156616210938, "step": 9004 }, { "epoch": 1.99, "learning_rate": 3.0871230102702545e-10, "logits/chosen": -1.959197998046875, "logits/rejected": -1.8602118492126465, "logps/chosen": -91.55009460449219, "logps/rejected": -55.689666748046875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 8.23809814453125, "rewards/margins": 5.163022518157959, "rewards/rejected": 3.075075626373291, "step": 9005 }, { "epoch": 1.99, "learning_rate": 2.8911680791832155e-10, "logits/chosen": -2.142435073852539, "logits/rejected": -2.102376937866211, "logps/chosen": -90.67572021484375, "logps/rejected": -29.23748016357422, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 9.09281063079834, "rewards/margins": 3.428962230682373, "rewards/rejected": 5.663848400115967, "step": 9006 }, { "epoch": 1.99, "learning_rate": 2.7016376563882005e-10, "logits/chosen": -1.9917103052139282, "logits/rejected": -1.9667762517929077, "logps/chosen": -35.29713439941406, "logps/rejected": -76.45161437988281, "loss": 0.3742, "rewards/accuracies": 0.0, "rewards/chosen": 3.3220460414886475, "rewards/margins": -0.07016587257385254, "rewards/rejected": 3.3922119140625, "step": 9007 }, { "epoch": 1.99, "learning_rate": 2.5185317662490547e-10, "logits/chosen": -2.4175567626953125, "logits/rejected": -2.4307961463928223, "logps/chosen": -67.71538543701172, "logps/rejected": -78.60039520263672, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 8.033219337463379, "rewards/margins": 4.926248550415039, "rewards/rejected": 3.106971025466919, "step": 9008 }, { "epoch": 1.99, "learning_rate": 2.3418504322803013e-10, "logits/chosen": -1.9258105754852295, "logits/rejected": -1.9467390775680542, "logps/chosen": -33.03569030761719, "logps/rejected": -79.91484069824219, "loss": 1.9597, "rewards/accuracies": 1.0, "rewards/chosen": 4.636348724365234, "rewards/margins": 0.5023946762084961, "rewards/rejected": 4.133954048156738, "step": 9009 }, { "epoch": 1.99, "learning_rate": 2.1715936771971037e-10, "logits/chosen": -2.0170631408691406, "logits/rejected": -2.02260422706604, "logps/chosen": -159.27334594726562, "logps/rejected": -119.71544647216797, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 9.635272026062012, "rewards/margins": 3.526604175567627, "rewards/rejected": 6.108667850494385, "step": 9010 }, { "epoch": 1.99, "learning_rate": 2.0077615228708547e-10, "logits/chosen": -1.9690659046173096, "logits/rejected": -1.8713359832763672, "logps/chosen": -108.40137481689453, "logps/rejected": -28.94768524169922, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 6.128670692443848, "rewards/margins": 4.539787292480469, "rewards/rejected": 1.588883638381958, "step": 9011 }, { "epoch": 1.99, "learning_rate": 1.8503539903569347e-10, "logits/chosen": -2.085184097290039, "logits/rejected": -2.0151844024658203, "logps/chosen": -52.63902282714844, "logps/rejected": -14.114912033081055, "loss": 0.3307, "rewards/accuracies": 1.0, "rewards/chosen": 2.1571602821350098, "rewards/margins": 1.1430374383926392, "rewards/rejected": 1.0141228437423706, "step": 9012 }, { "epoch": 1.99, "learning_rate": 1.6993710998836067e-10, "logits/chosen": -2.250133991241455, "logits/rejected": -2.1962716579437256, "logps/chosen": -60.83942413330078, "logps/rejected": -40.06322479248047, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 7.4942803382873535, "rewards/margins": 5.122217178344727, "rewards/rejected": 2.372063398361206, "step": 9013 }, { "epoch": 2.0, "learning_rate": 1.554812870840916e-10, "logits/chosen": -1.7681708335876465, "logits/rejected": -1.8598928451538086, "logps/chosen": -47.15843963623047, "logps/rejected": -174.3072509765625, "loss": 2.3181, "rewards/accuracies": 0.0, "rewards/chosen": 3.7808678150177, "rewards/margins": -4.563072204589844, "rewards/rejected": 8.343939781188965, "step": 9014 }, { "epoch": 2.0, "learning_rate": 1.4166793218195475e-10, "logits/chosen": -1.835186243057251, "logits/rejected": -1.7845712900161743, "logps/chosen": -53.90337371826172, "logps/rejected": -57.09822082519531, "loss": 0.1209, "rewards/accuracies": 1.0, "rewards/chosen": 4.745755672454834, "rewards/margins": 1.302722692489624, "rewards/rejected": 3.44303297996521, "step": 9015 }, { "epoch": 2.0, "learning_rate": 1.2849704705553135e-10, "logits/chosen": -2.0724644660949707, "logits/rejected": -2.087311267852783, "logps/chosen": -26.074037551879883, "logps/rejected": -86.67852783203125, "loss": 0.569, "rewards/accuracies": 0.0, "rewards/chosen": 4.139822483062744, "rewards/margins": -0.5982847213745117, "rewards/rejected": 4.738107204437256, "step": 9016 }, { "epoch": 2.0, "learning_rate": 1.1596863339846665e-10, "logits/chosen": -1.8318099975585938, "logits/rejected": -1.8243030309677124, "logps/chosen": -35.49568176269531, "logps/rejected": -54.238162994384766, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 5.446922302246094, "rewards/margins": -1.0430054664611816, "rewards/rejected": 6.489927768707275, "step": 9017 }, { "epoch": 2.0, "learning_rate": 1.040826928194738e-10, "logits/chosen": -1.9301061630249023, "logits/rejected": -1.8847739696502686, "logps/chosen": -95.52564239501953, "logps/rejected": -129.26361083984375, "loss": 0.1941, "rewards/accuracies": 1.0, "rewards/chosen": 9.177572250366211, "rewards/margins": 1.7371973991394043, "rewards/rejected": 7.440374851226807, "step": 9018 }, { "epoch": 2.0, "learning_rate": 9.283922684677482e-11, "logits/chosen": -2.2620885372161865, "logits/rejected": -2.260057210922241, "logps/chosen": -59.36563491821289, "logps/rejected": -80.1159439086914, "loss": 0.2871, "rewards/accuracies": 1.0, "rewards/chosen": 4.956571578979492, "rewards/margins": 0.9838366508483887, "rewards/rejected": 3.9727349281311035, "step": 9019 }, { "epoch": 2.0, "learning_rate": 8.223823692476984e-11, "logits/chosen": -1.915190577507019, "logits/rejected": -1.9161518812179565, "logps/chosen": -44.728397369384766, "logps/rejected": -68.88528442382812, "loss": 0.1652, "rewards/accuracies": 1.0, "rewards/chosen": 3.722501754760742, "rewards/margins": 0.9409663677215576, "rewards/rejected": 2.7815353870391846, "step": 9020 }, { "epoch": 2.0, "learning_rate": 7.227972441570253e-11, "logits/chosen": -1.926875352859497, "logits/rejected": -1.937692403793335, "logps/chosen": -41.02717590332031, "logps/rejected": -75.81425476074219, "loss": 0.1482, "rewards/accuracies": 1.0, "rewards/chosen": 5.3219804763793945, "rewards/margins": 1.5710434913635254, "rewards/rejected": 3.750936985015869, "step": 9021 }, { "epoch": 2.0, "learning_rate": 6.296369059854978e-11, "logits/chosen": -2.051802396774292, "logits/rejected": -2.0741069316864014, "logps/chosen": -46.86396789550781, "logps/rejected": -61.056114196777344, "loss": 0.25, "rewards/accuracies": 1.0, "rewards/chosen": 3.2641053199768066, "rewards/margins": 0.8180809020996094, "rewards/rejected": 2.4460244178771973, "step": 9022 }, { "epoch": 2.0, "learning_rate": 5.4290136671797386e-11, "logits/chosen": -2.057525634765625, "logits/rejected": -1.9997001886367798, "logps/chosen": -60.44759750366211, "logps/rejected": -83.84191131591797, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 5.778094291687012, "rewards/margins": 2.8729398250579834, "rewards/rejected": 2.9051544666290283, "step": 9023 }, { "epoch": 2.0, "learning_rate": 4.625906374899902e-11, "logits/chosen": -1.7074732780456543, "logits/rejected": -1.501811146736145, "logps/chosen": -17.138357162475586, "logps/rejected": -78.93051147460938, "loss": 2.4024, "rewards/accuracies": 0.0, "rewards/chosen": 1.7266149520874023, "rewards/margins": -4.796497821807861, "rewards/rejected": 6.523112773895264, "step": 9024 }, { "epoch": 2.0, "learning_rate": 3.8870472862662103e-11, "logits/chosen": -1.7808960676193237, "logits/rejected": -1.7620198726654053, "logps/chosen": -72.30445861816406, "logps/rejected": -98.92459106445312, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 12.003666877746582, "rewards/margins": 1.4323139190673828, "rewards/rejected": 10.5713529586792, "step": 9025 }, { "epoch": 2.0, "learning_rate": 3.2124364961472196e-11, "logits/chosen": -2.0934221744537354, "logits/rejected": -2.066359519958496, "logps/chosen": -34.76491928100586, "logps/rejected": -47.2774658203125, "loss": 0.2114, "rewards/accuracies": 1.0, "rewards/chosen": 3.494211196899414, "rewards/margins": 0.6861331462860107, "rewards/rejected": 2.8080780506134033, "step": 9026 }, { "epoch": 2.0, "learning_rate": 2.6020740912513497e-11, "logits/chosen": -1.8577765226364136, "logits/rejected": -1.8577765226364136, "logps/chosen": -39.7625732421875, "logps/rejected": -39.7625732421875, "loss": 0.3868, "rewards/accuracies": 0.0, "rewards/chosen": 2.600963592529297, "rewards/margins": 0.0, "rewards/rejected": 2.600963592529297, "step": 9027 }, { "epoch": 2.0, "learning_rate": 2.0559601500713677e-11, "logits/chosen": -1.8736644983291626, "logits/rejected": -1.9091483354568481, "logps/chosen": -50.29800033569336, "logps/rejected": -188.71865844726562, "loss": 0.1387, "rewards/accuracies": 1.0, "rewards/chosen": 10.189048767089844, "rewards/margins": 1.1750411987304688, "rewards/rejected": 9.014007568359375, "step": 9028 }, { "epoch": 2.0, "learning_rate": 1.5740947427178576e-11, "logits/chosen": -1.7591668367385864, "logits/rejected": -1.6434578895568848, "logps/chosen": -47.61248779296875, "logps/rejected": -32.521095275878906, "loss": 0.2734, "rewards/accuracies": 1.0, "rewards/chosen": 3.3703248500823975, "rewards/margins": 0.8382809162139893, "rewards/rejected": 2.532043933868408, "step": 9029 }, { "epoch": 2.0, "learning_rate": 1.1564779311412644e-11, "logits/chosen": -2.0414493083953857, "logits/rejected": -2.048335313796997, "logps/chosen": -37.99422073364258, "logps/rejected": -71.39813232421875, "loss": 0.6838, "rewards/accuracies": 0.0, "rewards/chosen": 4.378303050994873, "rewards/margins": -0.0380096435546875, "rewards/rejected": 4.4163126945495605, "step": 9030 }, { "epoch": 2.0, "learning_rate": 8.03109769020871e-12, "logits/chosen": -1.9428455829620361, "logits/rejected": -1.8693236112594604, "logps/chosen": -43.47581481933594, "logps/rejected": -63.94707489013672, "loss": 0.6837, "rewards/accuracies": 0.0, "rewards/chosen": 4.2823638916015625, "rewards/margins": -0.2865424156188965, "rewards/rejected": 4.568906307220459, "step": 9031 }, { "epoch": 2.0, "learning_rate": 5.1399030170928845e-12, "logits/chosen": -1.7169865369796753, "logits/rejected": -1.6943784952163696, "logps/chosen": -85.71720123291016, "logps/rejected": -155.57122802734375, "loss": 1.712, "rewards/accuracies": 0.0, "rewards/chosen": 8.553238868713379, "rewards/margins": -2.771230697631836, "rewards/rejected": 11.324469566345215, "step": 9032 }, { "epoch": 2.0, "learning_rate": 2.8911956639898762e-12, "logits/chosen": -1.8930621147155762, "logits/rejected": -1.8930621147155762, "logps/chosen": -39.33217239379883, "logps/rejected": -39.33217239379883, "loss": 0.3537, "rewards/accuracies": 0.0, "rewards/chosen": 4.29762601852417, "rewards/margins": 0.0, "rewards/rejected": 4.29762601852417, "step": 9033 }, { "epoch": 2.0, "learning_rate": 1.284975919557674e-12, "logits/chosen": -1.7923121452331543, "logits/rejected": -1.768784761428833, "logps/chosen": -37.092063903808594, "logps/rejected": -33.0675048828125, "loss": 0.4993, "rewards/accuracies": 0.0, "rewards/chosen": 3.9307289123535156, "rewards/margins": -0.1927652359008789, "rewards/rejected": 4.1234941482543945, "step": 9034 }, { "epoch": 2.0, "learning_rate": 3.2124399029775933e-13, "logits/chosen": -1.8891220092773438, "logits/rejected": -1.8810986280441284, "logps/chosen": -58.75425720214844, "logps/rejected": -101.72372436523438, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": 4.738946437835693, "rewards/margins": 1.2092177867889404, "rewards/rejected": 3.529728651046753, "step": 9035 }, { "epoch": 2.0, "learning_rate": 0.0, "logits/chosen": -2.062861919403076, "logits/rejected": -2.07110857963562, "logps/chosen": -53.66800308227539, "logps/rejected": -32.602134704589844, "loss": 0.546, "rewards/accuracies": 0.0, "rewards/chosen": 3.165888547897339, "rewards/margins": -0.6017856597900391, "rewards/rejected": 3.767674207687378, "step": 9036 }, { "epoch": 2.0, "step": 9036, "total_flos": 0.0, "train_loss": 0.6283123089440161, "train_runtime": 39110.9791, "train_samples_per_second": 0.462, "train_steps_per_second": 0.231 } ], "max_steps": 9036, "num_train_epochs": 2, "total_flos": 0.0, "trial_name": null, "trial_params": null }